From 2aa4a82499d4becd2284cdb482213d541b8804dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 28 Apr 2024 16:29:10 +0200 Subject: Adding upstream version 86.0.1. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/arm/32/cdef.S | 538 ++ third_party/dav1d/src/arm/32/cdef16.S | 232 + third_party/dav1d/src/arm/32/cdef_tmpl.S | 515 ++ third_party/dav1d/src/arm/32/ipred.S | 2959 ++++++++ third_party/dav1d/src/arm/32/itx.S | 3386 +++++++++ third_party/dav1d/src/arm/32/loopfilter.S | 868 +++ third_party/dav1d/src/arm/32/looprestoration.S | 2099 ++++++ third_party/dav1d/src/arm/32/looprestoration16.S | 720 ++ third_party/dav1d/src/arm/32/mc.S | 3349 +++++++++ third_party/dav1d/src/arm/32/mc16.S | 2734 +++++++ third_party/dav1d/src/arm/32/msac.S | 575 ++ third_party/dav1d/src/arm/32/util.S | 126 + third_party/dav1d/src/arm/64/cdef.S | 517 ++ third_party/dav1d/src/arm/64/cdef16.S | 228 + third_party/dav1d/src/arm/64/cdef_tmpl.S | 482 ++ third_party/dav1d/src/arm/64/ipred.S | 2764 +++++++ third_party/dav1d/src/arm/64/ipred16.S | 3076 ++++++++ third_party/dav1d/src/arm/64/itx.S | 3288 +++++++++ third_party/dav1d/src/arm/64/itx16.S | 3526 +++++++++ third_party/dav1d/src/arm/64/loopfilter.S | 1123 +++ third_party/dav1d/src/arm/64/loopfilter16.S | 907 +++ third_party/dav1d/src/arm/64/looprestoration.S | 1152 +++ third_party/dav1d/src/arm/64/looprestoration16.S | 1239 ++++ .../dav1d/src/arm/64/looprestoration_common.S | 432 ++ .../dav1d/src/arm/64/looprestoration_tmpl.S | 597 ++ third_party/dav1d/src/arm/64/mc.S | 3247 +++++++++ third_party/dav1d/src/arm/64/mc16.S | 3575 ++++++++++ third_party/dav1d/src/arm/64/msac.S | 480 ++ third_party/dav1d/src/arm/64/util.S | 197 + third_party/dav1d/src/arm/asm.S | 155 + third_party/dav1d/src/arm/cdef_init_tmpl.c | 85 + third_party/dav1d/src/arm/cpu.c | 99 + third_party/dav1d/src/arm/cpu.h | 37 + third_party/dav1d/src/arm/ipred_init_tmpl.c | 82 + third_party/dav1d/src/arm/itx_init_tmpl.c | 143 + 
third_party/dav1d/src/arm/loopfilter_init_tmpl.c | 47 + .../dav1d/src/arm/looprestoration_init_tmpl.c | 298 + third_party/dav1d/src/arm/mc_init_tmpl.c | 118 + third_party/dav1d/src/arm/msac.h | 52 + third_party/dav1d/src/cdef.h | 73 + third_party/dav1d/src/cdef_apply.h | 38 + third_party/dav1d/src/cdef_apply_tmpl.c | 234 + third_party/dav1d/src/cdef_tmpl.c | 312 + third_party/dav1d/src/cdf.c | 4142 +++++++++++ third_party/dav1d/src/cdf.h | 157 + third_party/dav1d/src/cpu.c | 63 + third_party/dav1d/src/cpu.h | 49 + third_party/dav1d/src/ctx.h | 91 + third_party/dav1d/src/data.c | 148 + third_party/dav1d/src/data.h | 60 + third_party/dav1d/src/dav1d.rc.in | 32 + third_party/dav1d/src/decode.c | 3638 ++++++++++ third_party/dav1d/src/decode.h | 35 + third_party/dav1d/src/dequant_tables.c | 229 + third_party/dav1d/src/dequant_tables.h | 37 + third_party/dav1d/src/env.h | 521 ++ third_party/dav1d/src/ext/x86/x86inc.asm | 1827 +++++ third_party/dav1d/src/fg_apply.h | 41 + third_party/dav1d/src/fg_apply_tmpl.c | 209 + third_party/dav1d/src/film_grain.h | 85 + third_party/dav1d/src/film_grain_tmpl.c | 437 ++ third_party/dav1d/src/getbits.c | 162 + third_party/dav1d/src/getbits.h | 59 + third_party/dav1d/src/internal.h | 357 + third_party/dav1d/src/intra_edge.c | 165 + third_party/dav1d/src/intra_edge.h | 57 + third_party/dav1d/src/ipred.h | 95 + third_party/dav1d/src/ipred_prepare.h | 108 + third_party/dav1d/src/ipred_prepare_tmpl.c | 204 + third_party/dav1d/src/ipred_tmpl.c | 763 ++ third_party/dav1d/src/itx.h | 50 + third_party/dav1d/src/itx_1d.c | 1034 +++ third_party/dav1d/src/itx_1d.h | 59 + third_party/dav1d/src/itx_tmpl.c | 256 + third_party/dav1d/src/levels.h | 288 + third_party/dav1d/src/lf_apply.h | 42 + third_party/dav1d/src/lf_apply_tmpl.c | 306 + third_party/dav1d/src/lf_mask.c | 482 ++ third_party/dav1d/src/lf_mask.h | 83 + third_party/dav1d/src/lib.c | 649 ++ third_party/dav1d/src/log.c | 57 + third_party/dav1d/src/log.h | 47 + 
third_party/dav1d/src/loopfilter.h | 59 + third_party/dav1d/src/loopfilter_tmpl.c | 260 + third_party/dav1d/src/looprestoration.h | 80 + third_party/dav1d/src/looprestoration_tmpl.c | 521 ++ third_party/dav1d/src/lr_apply.h | 44 + third_party/dav1d/src/lr_apply_tmpl.c | 302 + third_party/dav1d/src/mc.h | 138 + third_party/dav1d/src/mc_tmpl.c | 954 +++ third_party/dav1d/src/mem.c | 74 + third_party/dav1d/src/mem.h | 100 + third_party/dav1d/src/meson.build | 337 + third_party/dav1d/src/msac.c | 208 + third_party/dav1d/src/msac.h | 108 + third_party/dav1d/src/obu.c | 1603 +++++ third_party/dav1d/src/obu.h | 36 + third_party/dav1d/src/picture.c | 324 + third_party/dav1d/src/picture.h | 117 + third_party/dav1d/src/ppc/cdef_init_tmpl.c | 488 ++ third_party/dav1d/src/ppc/cpu.c | 51 + third_party/dav1d/src/ppc/cpu.h | 37 + .../dav1d/src/ppc/looprestoration_init_tmpl.c | 350 + third_party/dav1d/src/ppc/types.h | 54 + third_party/dav1d/src/qm.c | 3152 ++++++++ third_party/dav1d/src/qm.h | 37 + third_party/dav1d/src/recon.h | 75 + third_party/dav1d/src/recon_tmpl.c | 2062 ++++++ third_party/dav1d/src/ref.c | 111 + third_party/dav1d/src/ref.h | 58 + third_party/dav1d/src/refmvs.c | 909 +++ third_party/dav1d/src/refmvs.h | 233 + third_party/dav1d/src/scan.c | 444 ++ third_party/dav1d/src/scan.h | 37 + third_party/dav1d/src/tables.c | 1022 +++ third_party/dav1d/src/tables.h | 125 + third_party/dav1d/src/thread.h | 180 + third_party/dav1d/src/thread_data.h | 40 + third_party/dav1d/src/thread_task.c | 142 + third_party/dav1d/src/thread_task.h | 44 + third_party/dav1d/src/warpmv.c | 209 + third_party/dav1d/src/warpmv.h | 39 + third_party/dav1d/src/wedge.c | 342 + third_party/dav1d/src/wedge.h | 41 + third_party/dav1d/src/win32/thread.c | 96 + third_party/dav1d/src/x86/cdef_avx2.asm | 1799 +++++ third_party/dav1d/src/x86/cdef_avx512.asm | 868 +++ third_party/dav1d/src/x86/cdef_init_tmpl.c | 94 + third_party/dav1d/src/x86/cdef_sse.asm | 1355 ++++ third_party/dav1d/src/x86/cpu.c | 82 
+ third_party/dav1d/src/x86/cpu.h | 42 + third_party/dav1d/src/x86/cpuid.asm | 55 + third_party/dav1d/src/x86/film_grain.asm | 2405 +++++++ third_party/dav1d/src/x86/film_grain_init_tmpl.c | 77 + third_party/dav1d/src/x86/film_grain_ssse3.asm | 3301 +++++++++ third_party/dav1d/src/x86/ipred.asm | 5387 ++++++++++++++ third_party/dav1d/src/x86/ipred_init_tmpl.c | 139 + third_party/dav1d/src/x86/ipred_ssse3.asm | 3109 ++++++++ third_party/dav1d/src/x86/itx.asm | 5563 +++++++++++++++ third_party/dav1d/src/x86/itx_init_tmpl.c | 187 + third_party/dav1d/src/x86/itx_ssse3.asm | 6559 +++++++++++++++++ third_party/dav1d/src/x86/loopfilter.asm | 1601 +++++ third_party/dav1d/src/x86/loopfilter_init_tmpl.c | 60 + third_party/dav1d/src/x86/loopfilter_ssse3.asm | 2348 ++++++ third_party/dav1d/src/x86/looprestoration.asm | 1158 +++ .../dav1d/src/x86/looprestoration_init_tmpl.c | 233 + .../dav1d/src/x86/looprestoration_ssse3.asm | 1953 +++++ third_party/dav1d/src/x86/mc_avx2.asm | 5704 +++++++++++++++ third_party/dav1d/src/x86/mc_avx512.asm | 2395 +++++++ third_party/dav1d/src/x86/mc_init_tmpl.c | 366 + third_party/dav1d/src/x86/mc_sse.asm | 7544 ++++++++++++++++++++ third_party/dav1d/src/x86/msac.asm | 669 ++ third_party/dav1d/src/x86/msac.h | 64 + third_party/dav1d/src/x86/msac_init.c | 43 + 154 files changed, 134631 insertions(+) create mode 100644 third_party/dav1d/src/arm/32/cdef.S create mode 100644 third_party/dav1d/src/arm/32/cdef16.S create mode 100644 third_party/dav1d/src/arm/32/cdef_tmpl.S create mode 100644 third_party/dav1d/src/arm/32/ipred.S create mode 100644 third_party/dav1d/src/arm/32/itx.S create mode 100644 third_party/dav1d/src/arm/32/loopfilter.S create mode 100644 third_party/dav1d/src/arm/32/looprestoration.S create mode 100644 third_party/dav1d/src/arm/32/looprestoration16.S create mode 100644 third_party/dav1d/src/arm/32/mc.S create mode 100644 third_party/dav1d/src/arm/32/mc16.S create mode 100644 third_party/dav1d/src/arm/32/msac.S create mode 100644 
third_party/dav1d/src/arm/32/util.S create mode 100644 third_party/dav1d/src/arm/64/cdef.S create mode 100644 third_party/dav1d/src/arm/64/cdef16.S create mode 100644 third_party/dav1d/src/arm/64/cdef_tmpl.S create mode 100644 third_party/dav1d/src/arm/64/ipred.S create mode 100644 third_party/dav1d/src/arm/64/ipred16.S create mode 100644 third_party/dav1d/src/arm/64/itx.S create mode 100644 third_party/dav1d/src/arm/64/itx16.S create mode 100644 third_party/dav1d/src/arm/64/loopfilter.S create mode 100644 third_party/dav1d/src/arm/64/loopfilter16.S create mode 100644 third_party/dav1d/src/arm/64/looprestoration.S create mode 100644 third_party/dav1d/src/arm/64/looprestoration16.S create mode 100644 third_party/dav1d/src/arm/64/looprestoration_common.S create mode 100644 third_party/dav1d/src/arm/64/looprestoration_tmpl.S create mode 100644 third_party/dav1d/src/arm/64/mc.S create mode 100644 third_party/dav1d/src/arm/64/mc16.S create mode 100644 third_party/dav1d/src/arm/64/msac.S create mode 100644 third_party/dav1d/src/arm/64/util.S create mode 100644 third_party/dav1d/src/arm/asm.S create mode 100644 third_party/dav1d/src/arm/cdef_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/cpu.c create mode 100644 third_party/dav1d/src/arm/cpu.h create mode 100644 third_party/dav1d/src/arm/ipred_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/itx_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/loopfilter_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/looprestoration_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/mc_init_tmpl.c create mode 100644 third_party/dav1d/src/arm/msac.h create mode 100644 third_party/dav1d/src/cdef.h create mode 100644 third_party/dav1d/src/cdef_apply.h create mode 100644 third_party/dav1d/src/cdef_apply_tmpl.c create mode 100644 third_party/dav1d/src/cdef_tmpl.c create mode 100644 third_party/dav1d/src/cdf.c create mode 100644 third_party/dav1d/src/cdf.h create mode 100644 third_party/dav1d/src/cpu.c 
create mode 100644 third_party/dav1d/src/cpu.h create mode 100644 third_party/dav1d/src/ctx.h create mode 100644 third_party/dav1d/src/data.c create mode 100644 third_party/dav1d/src/data.h create mode 100644 third_party/dav1d/src/dav1d.rc.in create mode 100644 third_party/dav1d/src/decode.c create mode 100644 third_party/dav1d/src/decode.h create mode 100644 third_party/dav1d/src/dequant_tables.c create mode 100644 third_party/dav1d/src/dequant_tables.h create mode 100644 third_party/dav1d/src/env.h create mode 100644 third_party/dav1d/src/ext/x86/x86inc.asm create mode 100644 third_party/dav1d/src/fg_apply.h create mode 100644 third_party/dav1d/src/fg_apply_tmpl.c create mode 100644 third_party/dav1d/src/film_grain.h create mode 100644 third_party/dav1d/src/film_grain_tmpl.c create mode 100644 third_party/dav1d/src/getbits.c create mode 100644 third_party/dav1d/src/getbits.h create mode 100644 third_party/dav1d/src/internal.h create mode 100644 third_party/dav1d/src/intra_edge.c create mode 100644 third_party/dav1d/src/intra_edge.h create mode 100644 third_party/dav1d/src/ipred.h create mode 100644 third_party/dav1d/src/ipred_prepare.h create mode 100644 third_party/dav1d/src/ipred_prepare_tmpl.c create mode 100644 third_party/dav1d/src/ipred_tmpl.c create mode 100644 third_party/dav1d/src/itx.h create mode 100644 third_party/dav1d/src/itx_1d.c create mode 100644 third_party/dav1d/src/itx_1d.h create mode 100644 third_party/dav1d/src/itx_tmpl.c create mode 100644 third_party/dav1d/src/levels.h create mode 100644 third_party/dav1d/src/lf_apply.h create mode 100644 third_party/dav1d/src/lf_apply_tmpl.c create mode 100644 third_party/dav1d/src/lf_mask.c create mode 100644 third_party/dav1d/src/lf_mask.h create mode 100644 third_party/dav1d/src/lib.c create mode 100644 third_party/dav1d/src/log.c create mode 100644 third_party/dav1d/src/log.h create mode 100644 third_party/dav1d/src/loopfilter.h create mode 100644 third_party/dav1d/src/loopfilter_tmpl.c create mode 
100644 third_party/dav1d/src/looprestoration.h create mode 100644 third_party/dav1d/src/looprestoration_tmpl.c create mode 100644 third_party/dav1d/src/lr_apply.h create mode 100644 third_party/dav1d/src/lr_apply_tmpl.c create mode 100644 third_party/dav1d/src/mc.h create mode 100644 third_party/dav1d/src/mc_tmpl.c create mode 100644 third_party/dav1d/src/mem.c create mode 100644 third_party/dav1d/src/mem.h create mode 100644 third_party/dav1d/src/meson.build create mode 100644 third_party/dav1d/src/msac.c create mode 100644 third_party/dav1d/src/msac.h create mode 100644 third_party/dav1d/src/obu.c create mode 100644 third_party/dav1d/src/obu.h create mode 100644 third_party/dav1d/src/picture.c create mode 100644 third_party/dav1d/src/picture.h create mode 100644 third_party/dav1d/src/ppc/cdef_init_tmpl.c create mode 100644 third_party/dav1d/src/ppc/cpu.c create mode 100644 third_party/dav1d/src/ppc/cpu.h create mode 100644 third_party/dav1d/src/ppc/looprestoration_init_tmpl.c create mode 100644 third_party/dav1d/src/ppc/types.h create mode 100644 third_party/dav1d/src/qm.c create mode 100644 third_party/dav1d/src/qm.h create mode 100644 third_party/dav1d/src/recon.h create mode 100644 third_party/dav1d/src/recon_tmpl.c create mode 100644 third_party/dav1d/src/ref.c create mode 100644 third_party/dav1d/src/ref.h create mode 100644 third_party/dav1d/src/refmvs.c create mode 100644 third_party/dav1d/src/refmvs.h create mode 100644 third_party/dav1d/src/scan.c create mode 100644 third_party/dav1d/src/scan.h create mode 100644 third_party/dav1d/src/tables.c create mode 100644 third_party/dav1d/src/tables.h create mode 100644 third_party/dav1d/src/thread.h create mode 100644 third_party/dav1d/src/thread_data.h create mode 100644 third_party/dav1d/src/thread_task.c create mode 100644 third_party/dav1d/src/thread_task.h create mode 100644 third_party/dav1d/src/warpmv.c create mode 100644 third_party/dav1d/src/warpmv.h create mode 100644 third_party/dav1d/src/wedge.c 
create mode 100644 third_party/dav1d/src/wedge.h create mode 100644 third_party/dav1d/src/win32/thread.c create mode 100644 third_party/dav1d/src/x86/cdef_avx2.asm create mode 100644 third_party/dav1d/src/x86/cdef_avx512.asm create mode 100644 third_party/dav1d/src/x86/cdef_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/cdef_sse.asm create mode 100644 third_party/dav1d/src/x86/cpu.c create mode 100644 third_party/dav1d/src/x86/cpu.h create mode 100644 third_party/dav1d/src/x86/cpuid.asm create mode 100644 third_party/dav1d/src/x86/film_grain.asm create mode 100644 third_party/dav1d/src/x86/film_grain_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/film_grain_ssse3.asm create mode 100644 third_party/dav1d/src/x86/ipred.asm create mode 100644 third_party/dav1d/src/x86/ipred_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/ipred_ssse3.asm create mode 100644 third_party/dav1d/src/x86/itx.asm create mode 100644 third_party/dav1d/src/x86/itx_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/itx_ssse3.asm create mode 100644 third_party/dav1d/src/x86/loopfilter.asm create mode 100644 third_party/dav1d/src/x86/loopfilter_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/loopfilter_ssse3.asm create mode 100644 third_party/dav1d/src/x86/looprestoration.asm create mode 100644 third_party/dav1d/src/x86/looprestoration_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/looprestoration_ssse3.asm create mode 100644 third_party/dav1d/src/x86/mc_avx2.asm create mode 100644 third_party/dav1d/src/x86/mc_avx512.asm create mode 100644 third_party/dav1d/src/x86/mc_init_tmpl.c create mode 100644 third_party/dav1d/src/x86/mc_sse.asm create mode 100644 third_party/dav1d/src/x86/msac.asm create mode 100644 third_party/dav1d/src/x86/msac.h create mode 100644 third_party/dav1d/src/x86/msac_init.c (limited to 'third_party/dav1d/src') diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S new file mode 100644 index 
0000000000..166ce91a82 --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef.S @@ -0,0 +1,538 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +// n1 = s0/d0 +// w1 = d0/q0 +// n2 = s4/d2 +// w2 = d2/q1 +.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldrh r12, [\s1, #-2] + vldr \n1, [\s1] + vdup.16 d4, r12 + ldrh r12, [\s1, #\w] + vmov.16 d4[1], r12 + ldrh r12, [\s2, #-2] + vldr \n2, [\s2] + vmov.16 d4[2], r12 + ldrh r12, [\s2, #\w] + vmovl.u8 q0, d0 + vmov.16 d4[3], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s8, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s9, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s10, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s11, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldrh r12, [\s1, #-2] + vldr \n1, [\s1] + vdup.16 d4, r12 + ldrh r12, [\s2, #-2] + vldr \n2, [\s2] + vmovl.u8 q0, d0 + vmov.16 d4[1], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s8, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s9, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vldr \n1, [\s1] + ldrh r12, [\s1, #\w] + vldr \n2, [\s2] + vdup.16 d4, r12 + ldrh r12, [\s2, #\w] + vmovl.u8 q0, d0 + vmov.16 d4[1], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s8, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s9, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vldr \n1, [\s1] + vldr \n2, [\s2] + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s12, [r0, #-4] + 
vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr dst, src, incr, w +.if \w == 4 + vld1.32 {\dst\()[0]}, [\src, :32], \incr +.else + vld1.8 {\dst\()}, [\src, :64], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +// n1 = s0/d0 +// w1 = d0/q0 +// n2 = s4/d2 +// w2 = d2/q1 +.macro padding_func w, stride, n1, w1, n2, w2, align +function cdef_padding\w\()_8bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldr r6, [sp, #28] + cmp r6, #0xf // fully edged + beq cdef_padding\w\()_edged_8bpc_neon + vmov.i16 q3, #0x8000 + tst r6, #4 // CDEF_HAVE_TOP + bne 1f + // !CDEF_HAVE_TOP + sub r12, r0, #2*(2*\stride+2) + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + b 3f +1: + // CDEF_HAVE_TOP + add r7, r4, r2 + sub r0, r0, #2*(2*\stride) + pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0 + + // Middle section +3: + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vld1.16 {d2[]}, [r3, :16]! + ldrh r12, [r1, #\w] + load_n_incr d0, r1, r2, \w + subs r5, r5, #1 + vmov.16 d2[1], r12 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s4, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s5, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {d2[]}, [r3, :16]! 
+ load_n_incr d0, r1, r2, \w + subs r5, r5, #1 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s4, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + b 3f +2: + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldrh r12, [r1, #\w] + load_n_incr d0, r1, r2, \w + vdup.16 d2, r12 + subs r5, r5, #1 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s4, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr d0, r1, r2, \w + subs r5, r5, #1 + vmovl.u8 q0, d0 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + +3: + tst r6, #8 // CDEF_HAVE_BOTTOM + bne 1f + // !CDEF_HAVE_BOTTOM + sub r12, r0, #4 + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + pop {r4-r7,pc} +1: + // CDEF_HAVE_BOTTOM + add r7, r1, r2 + pad_top_bottom r1, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 +endfunc +.endm + +padding_func 8, 16, d0, q0, d2, q1, 128 +padding_func 4, 8, s0, d0, s4, d2, 64 + +// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_edged w, stride, reg, align +function cdef_padding\w\()_edged_8bpc_neon + sub r0, r0, #(2*\stride) + + ldrh r12, [r4, #-2] + vldr \reg, [r4] + add r7, r4, r2 + strh r12, [r0, #-2] + ldrh r12, [r4, #\w] + vstr \reg, [r0] + strh r12, [r0, #\w] + + ldrh r12, [r7, #-2] + vldr \reg, [r7] + strh r12, [r0, #\stride-2] + ldrh r12, [r7, #\w] + vstr \reg, [r0, #\stride] + strh r12, [r0, #\stride+\w] + add r0, r0, #2*\stride + +0: + ldrh r12, [r3], #2 + vldr \reg, [r1] + str r12, [r0, #-2] + ldrh r12, [r1, #\w] + add r1, r1, r2 + subs r5, r5, #1 + vstr \reg, [r0] + str r12, [r0, #\w] + add r0, r0, #\stride + bgt 0b + + ldrh r12, [r1, 
#-2] + vldr \reg, [r1] + add r7, r1, r2 + strh r12, [r0, #-2] + ldrh r12, [r1, #\w] + vstr \reg, [r0] + strh r12, [r0, #\w] + + ldrh r12, [r7, #-2] + vldr \reg, [r7] + strh r12, [r0, #\stride-2] + ldrh r12, [r7, #\w] + vstr \reg, [r0, #\stride] + strh r12, [r0, #\stride+\w] + + pop {r4-r7,pc} +endfunc +.endm + +padding_func_edged 8, 16, d0, 64 +padding_func_edged 4, 8, s0, 32 + +tables + +filter 8, 8 +filter 4, 8 + +find_dir 8 + +.macro load_px_8 d11, d12, d21, d22, w +.if \w == 8 + add r6, r2, r9 // x + off + sub r9, r2, r9 // x - off + vld1.8 {\d11}, [r6] // p0 + add r6, r6, #16 // += stride + vld1.8 {\d21}, [r9] // p1 + add r9, r9, #16 // += stride + vld1.8 {\d12}, [r6] // p0 + vld1.8 {\d22}, [r9] // p1 +.else + add r6, r2, r9 // x + off + sub r9, r2, r9 // x - off + vld1.32 {\d11[0]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d21[0]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d11[1]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d21[1]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d12[0]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d22[0]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d12[1]}, [r6] // p0 + vld1.32 {\d22[1]}, [r9] // p1 +.endif +.endm +.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min +.if \min + vmin.u8 q3, q3, \s1 + vmax.u8 q4, q4, \s1 + vmin.u8 q3, q3, \s2 + vmax.u8 q4, q4, \s2 +.endif + vabd.u8 q8, q0, \s1 // abs(diff) + vabd.u8 q11, q0, \s2 // abs(diff) + vshl.u8 q9, q8, \shift // abs(diff) >> shift + vshl.u8 q12, q11, \shift // abs(diff) >> shift + vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) + vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) + vcgt.u8 q10, q0, \s1 // px > p0 + vcgt.u8 q13, q0, \s2 // px > p1 + vmin.u8 q9, q9, q8 // imin(abs(diff), clip) + vmin.u8 q12, q12, q11 // imin(abs(diff), clip) + vneg.s8 q8, q9 // -imin() + vneg.s8 q11, q12 // -imin() + vbsl q10, q8, q9 // constrain() = 
imax(imin(diff, clip), -clip) + vdup.8 d18, \tap // taps[k] + vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip) + vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain() + vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain() + vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain() + vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain() +.endm + +// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func_8 w, pri, sec, min, suffix +function cdef_filter\w\suffix\()_edged_neon +.if \pri + movrel_local r8, pri_taps + and r9, r3, #1 + add r8, r8, r9, lsl #1 +.endif + movrel_local r9, directions\w + add r5, r9, r5, lsl #1 + vmov.u8 d17, #7 + vdup.8 d16, r6 // damping + + vmov.8 d8[0], r3 + vmov.8 d8[1], r4 + vclz.i8 d8, d8 // clz(threshold) + vsub.i8 d8, d17, d8 // ulog2(threshold) + vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) + vneg.s8 d8, d8 // -shift +.if \sec + vdup.8 q6, d8[1] +.endif +.if \pri + vdup.8 q5, d8[0] +.endif + +1: +.if \w == 8 + add r12, r2, #16 + vld1.8 {d0}, [r2, :64] // px + vld1.8 {d1}, [r12, :64] // px +.else + add r12, r2, #8 + vld1.32 {d0[0]}, [r2, :32] // px + add r9, r2, #2*8 + vld1.32 {d0[1]}, [r12, :32] // px + add r12, r12, #2*8 + vld1.32 {d1[0]}, [r9, :32] // px + vld1.32 {d1[1]}, [r12, :32] // px +.endif + + vmov.u8 q1, #0 // sum + vmov.u8 q2, #0 // sum +.if \min + vmov.u16 q3, q0 // min + vmov.u16 q4, q0 // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. 
+ mov lr, #2 // sec_taps[0] + +2: +.if \pri + ldrsb r9, [r5] // off1 + + load_px_8 d28, d29, d30, d31, \w +.endif + +.if \sec + add r5, r5, #4 // +2*2 + ldrsb r9, [r5] // off2 +.endif + +.if \pri + ldrb r12, [r8] // *pri_taps + vdup.8 q7, r3 // threshold + + handle_pixel_8 q14, q15, q7, q5, r12, \min +.endif + +.if \sec + load_px_8 d28, d29, d30, d31, \w + + add r5, r5, #8 // +2*4 + ldrsb r9, [r5] // off3 + + vdup.8 q7, r4 // threshold + + handle_pixel_8 q14, q15, q7, q6, lr, \min + + load_px_8 d28, d29, d30, d31, \w + + handle_pixel_8 q14, q15, q7, q6, lr, \min + + sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; +.else + add r5, r5, #1 // r5 += 1 +.endif + subs lr, lr, #1 // sec_tap-- (value) +.if \pri + add r8, r8, #1 // pri_taps++ (pointer) +.endif + bne 2b + + vshr.s16 q14, q1, #15 // -(sum < 0) + vshr.s16 q15, q2, #15 // -(sum < 0) + vadd.i16 q1, q1, q14 // sum - (sum < 0) + vadd.i16 q2, q2, q15 // sum - (sum < 0) + vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 + vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4 + vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4 + vaddw.u8 q2, q2, d1 // px + (8 + sum ...) 
>> 4 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 +.if \min + vmin.u8 q0, q0, q4 + vmax.u8 q0, q0, q3 // iclip(px + .., min, max) +.endif +.if \w == 8 + vst1.8 {d0}, [r0, :64], r1 + add r2, r2, #2*16 // tmp += 2*tmp_stride + subs r7, r7, #2 // h -= 2 + vst1.8 {d1}, [r0, :64], r1 +.else + vst1.32 {d0[0]}, [r0, :32], r1 + add r2, r2, #4*8 // tmp += 4*tmp_stride + vst1.32 {d0[1]}, [r0, :32], r1 + subs r7, r7, #4 // h -= 4 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 +.endif + + // Reset pri_taps and directions back to the original point + sub r5, r5, #2 +.if \pri + sub r8, r8, #2 +.endif + + bgt 1b + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro filter_8 w +filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri +filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec +filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec +.endm + +filter_8 8 +filter_8 4 diff --git a/third_party/dav1d/src/arm/32/cdef16.S b/third_party/dav1d/src/arm/32/cdef16.S new file mode 100644 index 0000000000..dee2d3dc9e --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef16.S @@ -0,0 +1,232 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +// r1 = d0/q0 +// r2 = d2/q1 +.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s1, #2*\w] + vldr s10, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vldr s11, [\s2, #2*\w] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s9, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s10, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s11, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s9, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vldr s8, [\s1, #2*\w] + vld1.16 {\r2}, [\s2, :\align] + vldr s9, [\s2, #2*\w] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s8, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + 
vst1.16 {\r2}, [r0, :\align] + vstr s9, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vld1.16 {\r2}, [\s2, :\align] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride +.endif +3: +.endm + +// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +// r1 = d0/q0 +// r2 = d2/q1 +.macro padding_func_16 w, stride, r1, r2, align +function cdef_padding\w\()_16bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldr r6, [sp, #28] + vmov.i16 q3, #0x8000 + tst r6, #4 // CDEF_HAVE_TOP + bne 1f + // !CDEF_HAVE_TOP + sub r12, r0, #2*(2*\stride+2) + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + b 3f +1: + // CDEF_HAVE_TOP + add r7, r4, r2 + sub r0, r0, #2*(2*\stride) + pad_top_bot_16 r4, r7, \w, \stride, \r1, \r2, \align, 0 + + // Middle section +3: + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vld1.32 {d2[]}, [r3, :32]! + vldr s5, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s5, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.32 {d2[]}, [r3, :32]! 
+ vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + b 3f +2: + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vldr s4, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s4, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + +3: + tst r6, #8 // CDEF_HAVE_BOTTOM + bne 1f + // !CDEF_HAVE_BOTTOM + sub r12, r0, #4 + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + pop {r4-r7,pc} +1: + // CDEF_HAVE_BOTTOM + add r7, r1, r2 + pad_top_bot_16 r1, r7, \w, \stride, \r1, \r2, \align, 1 +endfunc +.endm + +padding_func_16 8, 16, q0, q1, 128 +padding_func_16 4, 8, d0, d2, 64 + +tables + +filter 8, 16 +filter 4, 16 + +find_dir 16 diff --git a/third_party/dav1d/src/arm/32/cdef_tmpl.S b/third_party/dav1d/src/arm/32/cdef_tmpl.S new file mode 100644 index 0000000000..33ff9e5816 --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef_tmpl.S @@ -0,0 +1,515 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro dir_table w, stride +const directions\w + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 + .byte 1 * \stride + 0, 2 * \stride + 0 + .byte 1 * \stride + 0, 2 * \stride - 1 +// Repeated, to avoid & 7 + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 +endconst +.endm + +.macro tables +dir_table 8, 16 +dir_table 4, 8 + +const pri_taps + .byte 4, 2, 3, 3 +endconst +.endm + +.macro load_px d11, d12, d21, d22, w +.if \w == 8 + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11,\d12}, [r6] // p0 + vld1.16 {\d21,\d22}, [r9] // p1 +.else + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11}, [r6] // p0 + add r6, r6, #2*8 // += stride + vld1.16 {\d21}, [r9] // p1 + add 
r9, r9, #2*8 // += stride + vld1.16 {\d12}, [r6] // p0 + vld1.16 {\d22}, [r9] // p1 +.endif +.endm +.macro handle_pixel s1, s2, thresh_vec, shift, tap, min +.if \min + vmin.u16 q2, q2, \s1 + vmax.s16 q3, q3, \s1 + vmin.u16 q2, q2, \s2 + vmax.s16 q3, q3, \s2 +.endif + vabd.u16 q8, q0, \s1 // abs(diff) + vabd.u16 q11, q0, \s2 // abs(diff) + vshl.u16 q9, q8, \shift // abs(diff) >> shift + vshl.u16 q12, q11, \shift // abs(diff) >> shift + vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) + vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) + vsub.i16 q10, \s1, q0 // diff = p0 - px + vsub.i16 q13, \s2, q0 // diff = p1 - px + vneg.s16 q8, q9 // -clip + vneg.s16 q11, q12 // -clip + vmin.s16 q10, q10, q9 // imin(diff, clip) + vmin.s16 q13, q13, q12 // imin(diff, clip) + vdup.16 q9, \tap // taps[k] + vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) + vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) + vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() + vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() +.endm + +// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func w, bpc, pri, sec, min, suffix +function cdef_filter\w\suffix\()_\bpc\()bpc_neon +.if \bpc == 8 + cmp r8, #0xf + beq cdef_filter\w\suffix\()_edged_neon +.endif +.if \pri +.if \bpc == 16 + clz r9, r9 + sub r9, r9, #24 // -bitdepth_min_8 + neg r9, r9 // bitdepth_min_8 +.endif + movrel_local r8, pri_taps +.if \bpc == 16 + lsr r9, r3, r9 // pri_strength >> bitdepth_min_8 + and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1 +.else + and r9, r3, #1 +.endif + add r8, r8, r9, lsl #1 +.endif + movrel_local r9, directions\w + add r5, r9, r5, lsl #1 + vmov.u16 d17, #15 + vdup.16 d16, r6 // damping + +.if \pri + vdup.16 q5, r3 // threshold +.endif +.if \sec + vdup.16 
q7, r4 // threshold +.endif + vmov.16 d8[0], r3 + vmov.16 d8[1], r4 + vclz.i16 d8, d8 // clz(threshold) + vsub.i16 d8, d17, d8 // ulog2(threshold) + vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) + vneg.s16 d8, d8 // -shift +.if \sec + vdup.16 q6, d8[1] +.endif +.if \pri + vdup.16 q4, d8[0] +.endif + +1: +.if \w == 8 + vld1.16 {q0}, [r2, :128] // px +.else + add r12, r2, #2*8 + vld1.16 {d0}, [r2, :64] // px + vld1.16 {d1}, [r12, :64] // px +.endif + + vmov.u16 q1, #0 // sum +.if \min + vmov.u16 q2, q0 // min + vmov.u16 q3, q0 // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov lr, #2 // sec_taps[0] + +2: +.if \pri + ldrsb r9, [r5] // off1 + + load_px d28, d29, d30, d31, \w +.endif + +.if \sec + add r5, r5, #4 // +2*2 + ldrsb r9, [r5] // off2 +.endif + +.if \pri + ldrb r12, [r8] // *pri_taps + + handle_pixel q14, q15, q5, q4, r12, \min +.endif + +.if \sec + load_px d28, d29, d30, d31, \w + + add r5, r5, #8 // +2*4 + ldrsb r9, [r5] // off3 + + handle_pixel q14, q15, q7, q6, lr, \min + + load_px d28, d29, d30, d31, \w + + handle_pixel q14, q15, q7, q6, lr, \min + + sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; +.else + add r5, r5, #1 // r5 += 1 +.endif + subs lr, lr, #1 // sec_tap-- (value) +.if \pri + add r8, r8, #1 // pri_taps++ (pointer) +.endif + bne 2b + + vshr.s16 q14, q1, #15 // -(sum < 0) + vadd.i16 q1, q1, q14 // sum - (sum < 0) + vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 + vadd.i16 q0, q0, q1 // px + (8 + sum ...) 
>> 4 +.if \min + vmin.s16 q0, q0, q3 + vmax.s16 q0, q0, q2 // iclip(px + .., min, max) +.endif +.if \bpc == 8 + vmovn.u16 d0, q0 +.endif +.if \w == 8 + add r2, r2, #2*16 // tmp += tmp_stride + subs r7, r7, #1 // h-- +.if \bpc == 8 + vst1.8 {d0}, [r0, :64], r1 +.else + vst1.16 {q0}, [r0, :128], r1 +.endif +.else +.if \bpc == 8 + vst1.32 {d0[0]}, [r0, :32], r1 +.else + vst1.16 {d0}, [r0, :64], r1 +.endif + add r2, r2, #2*16 // tmp += 2*tmp_stride + subs r7, r7, #2 // h -= 2 +.if \bpc == 8 + vst1.32 {d0[1]}, [r0, :32], r1 +.else + vst1.16 {d1}, [r0, :64], r1 +.endif +.endif + + // Reset pri_taps and directions back to the original point + sub r5, r5, #2 +.if \pri + sub r8, r8, #2 +.endif + + bgt 1b + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro filter w, bpc +filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri +filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec +filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec + +function cdef_filter\w\()_\bpc\()bpc_neon, export=1 + push {r4-r9,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #92] + ldrd r6, r7, [sp, #100] +.if \bpc == 16 + ldrd r8, r9, [sp, #108] +.else + ldr r8, [sp, #108] +.endif + cmp r3, #0 // pri_strength + bne 1f + b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec +1: + cmp r4, #0 // sec_strength + bne 1f + b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri +1: + b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec +endfunc +.endm + +const div_table, align=4 + .short 840, 420, 280, 210, 168, 140, 120, 105 +endconst + +const alt_fact, align=4 + .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 +endconst + +.macro cost_alt dest, s1, s2, s3, s4, s5, s6 + vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] + vmull.s16 q2, \s2, \s2 + vmull.s16 q3, \s3, \s3 + vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] + vmull.s16 q12, \s5, \s5 + vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here + vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact + vmla.i32 q1, q2, q14 + vmla.i32 q1, q3, 
q15 + vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact + vmla.i32 q5, q12, q14 + vmla.i32 q5, q6, q15 + vadd.i32 d2, d2, d3 + vadd.i32 d3, d10, d11 + vpadd.i32 \dest, d2, d3 // *cost_ptr +.endm + +.macro find_best s1, s2, s3 +.ifnb \s2 + vmov.32 lr, \s2 +.endif + cmp r12, r1 // cost[n] > best_cost + itt gt + movgt r0, r3 // best_dir = n + movgt r1, r12 // best_cost = cost[n] +.ifnb \s2 + add r3, r3, #1 // n++ + cmp lr, r1 // cost[n] > best_cost + vmov.32 r12, \s3 + itt gt + movgt r0, r3 // best_dir = n + movgt r1, lr // best_cost = cost[n] + add r3, r3, #1 // n++ +.endif +.endm + +// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, +// unsigned *const var) +.macro find_dir bpc +function cdef_find_dir_\bpc\()bpc_neon, export=1 + push {lr} + vpush {q4-q7} +.if \bpc == 16 + clz r3, r3 // clz(bitdepth_max) + sub lr, r3, #24 // -bitdepth_min_8 +.endif + sub sp, sp, #32 // cost + mov r3, #8 + vmov.u16 q1, #0 // q0-q1 sum_diag[0] + vmov.u16 q3, #0 // q2-q3 sum_diag[1] + vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] + vmov.u16 q8, #0 // q6,d16 sum_alt[0] + // q7,d17 sum_alt[1] + vmov.u16 q9, #0 // q9,d22 sum_alt[2] + vmov.u16 q11, #0 + vmov.u16 q10, #0 // q10,d23 sum_alt[3] + + +.irpc i, 01234567 +.if \bpc == 8 + vld1.8 {d30}, [r0, :64], r1 + vmov.u8 d31, #128 + vsubl.u8 q15, d30, d31 // img[x] - 128 +.else + vld1.16 {q15}, [r0, :128], r1 + vdup.16 q14, lr // -bitdepth_min_8 + vshl.u16 q15, q15, q14 + vmov.u16 q14, #128 + vsub.i16 q15, q15, q14 // img[x] - 128 +.endif + vmov.u16 q14, #0 + +.if \i == 0 + vmov q0, q15 // sum_diag[0] +.else + vext.8 q12, q14, q15, #(16-2*\i) + vext.8 q13, q15, q14, #(16-2*\i) + vadd.i16 q0, q0, q12 // sum_diag[0] + vadd.i16 q1, q1, q13 // sum_diag[0] +.endif + vrev64.16 q13, q15 + vswp d26, d27 // [-x] +.if \i == 0 + vmov q2, q13 // sum_diag[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q2, q2, q12 // sum_diag[1] + vadd.i16 q3, q3, q13 // sum_diag[1] +.endif + + vpadd.u16 d26, d30, d31 
// [(x >> 1)] + vmov.u16 d27, #0 + vpadd.u16 d24, d26, d28 + vpadd.u16 d24, d24, d28 // [y] + vmov.u16 r12, d24[0] + vadd.i16 q5, q5, q15 // sum_hv[1] +.if \i < 4 + vmov.16 d8[\i], r12 // sum_hv[0] +.else + vmov.16 d9[\i-4], r12 // sum_hv[0] +.endif + +.if \i == 0 + vmov.u16 q6, q13 // sum_alt[0] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q14, q13, q14, #(16-2*\i) + vadd.i16 q6, q6, q12 // sum_alt[0] + vadd.i16 d16, d16, d28 // sum_alt[0] +.endif + vrev64.16 d26, d26 // [-(x >> 1)] + vmov.u16 q14, #0 +.if \i == 0 + vmov q7, q13 // sum_alt[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q7, q7, q12 // sum_alt[1] + vadd.i16 d17, d17, d26 // sum_alt[1] +.endif + +.if \i < 6 + vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) + vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) + vadd.i16 q9, q9, q12 // sum_alt[2] + vadd.i16 d22, d22, d26 // sum_alt[2] +.else + vadd.i16 q9, q9, q15 // sum_alt[2] +.endif +.if \i == 0 + vmov q10, q15 // sum_alt[3] +.elseif \i == 1 + vadd.i16 q10, q10, q15 // sum_alt[3] +.else + vext.8 q12, q14, q15, #(16-2*(\i/2)) + vext.8 q13, q15, q14, #(16-2*(\i/2)) + vadd.i16 q10, q10, q12 // sum_alt[3] + vadd.i16 d23, d23, d26 // sum_alt[3] +.endif +.endr + + vmov.u32 q15, #105 + + vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] + vmlal.s16 q12, d9, d9 + vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] + vmlal.s16 q13, d11, d11 + vadd.s32 d8, d24, d25 + vadd.s32 d9, d26, d27 + vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) + vmul.i32 d8, d8, d30 // cost[2,6] *= 105 + + vrev64.16 q1, q1 + vrev64.16 q3, q3 + vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] + vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] + + vstr s16, [sp, #2*4] // cost[2] + vstr s17, [sp, #6*4] // cost[6] + + movrel_local r12, div_table + vld1.16 {q14}, [r12, :128] + + vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] + vmull.s16 q12, d1, d1 + vmlal.s16 q5, d2, d2 + vmlal.s16 q12, d3, d3 + vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] + vmull.s16 q1, d5, d5 + 
vmlal.s16 q0, d6, d6 + vmlal.s16 q1, d7, d7 + vmovl.u16 q13, d28 // div_table + vmovl.u16 q14, d29 + vmul.i32 q5, q5, q13 // cost[0] + vmla.i32 q5, q12, q14 + vmul.i32 q0, q0, q13 // cost[4] + vmla.i32 q0, q1, q14 + vadd.i32 d10, d10, d11 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 + + movrel_local r12, alt_fact + vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 + + vstr s0, [sp, #0*4] // cost[0] + vstr s1, [sp, #4*4] // cost[4] + + vmovl.u16 q13, d29 // div_table[2*m+1] + 105 + vmovl.u16 q14, d30 + vmovl.u16 q15, d31 + + cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] + cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7] + vstr s28, [sp, #1*4] // cost[1] + vstr s29, [sp, #3*4] // cost[3] + + mov r0, #0 // best_dir + vmov.32 r1, d0[0] // best_cost + mov r3, #1 // n + + vstr s30, [sp, #5*4] // cost[5] + vstr s31, [sp, #7*4] // cost[7] + + vmov.32 r12, d14[0] + + find_best d14[0], d8[0], d14[1] + find_best d14[1], d0[1], d15[0] + find_best d15[0], d8[1], d15[1] + find_best d15[1] + + eor r3, r0, #4 // best_dir ^4 + ldr r12, [sp, r3, lsl #2] + sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] + lsr r1, r1, #10 + str r1, [r2] // *var + + add sp, sp, #32 + vpop {q4-q7} + pop {pc} +endfunc +.endm diff --git a/third_party/dav1d/src/arm/32/ipred.S b/third_party/dav1d/src/arm/32/ipred.S new file mode 100644 index 0000000000..788c0625d5 --- /dev/null +++ b/third_party/dav1d/src/arm/32/ipred.S @@ -0,0 +1,2959 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * Copyright © 2019, B Krishnan Iyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_128_8bpc_neon, export=1 + push {r4, lr} + ldr r4, [sp, #8] + clz r3, r3 + adr r2, L(ipred_dc_128_tbl) + sub r3, r3, #25 + ldr r3, [r2, r3, lsl #2] + mov lr, #128 + vdup.8 q0, lr + add r2, r2, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r2 + + .align 2 +L(ipred_dc_128_tbl): + .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4, pc} +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4, pc} +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vdup.8 q1, lr +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vdup.8 q1, lr + vdup.8 q2, lr + vdup.8 q3, lr + sub r1, r1, #32 +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_8bpc_neon, export=1 + push {r4, lr} + ldr lr, [sp, #8] + clz r3, r3 + adr r4, L(ipred_v_tbl) + sub r3, r3, #25 + ldr r3, [r4, r3, lsl #2] + add r2, r2, #1 + add r4, r4, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r4 + + .align 2 +L(ipred_v_tbl): + .word 640f - L(ipred_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_v_tbl) + CONFIG_THUMB +40: + vld1.32 {d0[]}, [r2] +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs lr, lr, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4, pc} +80: + vld1.8 {d0}, [r2] +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs lr, lr, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4, pc} +160: + vld1.8 {q0}, [r2] +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vld1.8 {q0, q1}, [r2] +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vld1.8 {q0, q1}, [r2]! + sub r1, r1, #32 + vld1.8 {q2, q3}, [r2] +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_h_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + sub r2, r2, #4 + mov lr, #-4 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_h_tbl): + .word 640f - L(ipred_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_h_tbl) + CONFIG_THUMB + .word 8f - L(ipred_h_tbl) + CONFIG_THUMB + .word 4f - L(ipred_h_tbl) + CONFIG_THUMB +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr + vst1.32 {d3[0]}, [r0, :32], r1 + vst1.32 {d2[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4-r5, pc} +8: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr + vst1.8 {d3}, [r0, :64], r1 + vst1.8 {d2}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d1}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + add r2, r2, #3 + mov lr, #-1 +16: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128], r1 + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128], r1 + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + add r2, r2, #3 + mov lr, #-1 + sub r1, r1, #16 +32: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128]! + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r12, :128], r1 + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! 
+ vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + add r2, r2, #3 + mov lr, #-1 + sub r1, r1, #48 +64: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128]! + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vst1.8 {q1}, [r12, :128]! + vst1.8 {q0}, [r0, :128]! + vst1.8 {q1}, [r12, :128]! + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r12, :128], r1 + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_dc_top_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + add r2, r2, #1 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_top_tbl): + .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB +40: + vld1.32 {d0[]}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d0, d0[0] +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4-r5, pc} +80: + vld1.8 {d0}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #3 + vdup.8 d0, d0[0] +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + 
subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + vld1.8 {d0, d1}, [r2] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + vld1.8 {d0, d1, d2, d3}, [r2] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d4, q0, #5 + vdup.8 q0, d4[0] + vdup.8 q1, d4[0] +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + vld1.8 {d0, d1, d2, d3}, [r2]! + vaddl.u8 q0, d0, d1 + vld1.8 {d4, d5, d6, d7}, [r2] + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q0, q1 + vadd.u16 q1, q2, q3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d18, q0, #6 + vdup.8 q0, d18[0] + vdup.8 q1, d18[0] + vdup.8 q2, d18[0] + vdup.8 q3, d18[0] + sub r1, r1, #32 +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + sub r2, r2, r4 + clz r3, r3 + clz lr, r4 + sub lr, lr, #25 + adr r5, L(ipred_dc_left_tbl) + sub r3, r3, #20 + ldr r3, [r5, r3, lsl #2] + ldr lr, [r5, lr, lsl #2] + add r3, r5, r3 + add r5, r5, lr + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_left_tbl): + .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + +L(ipred_dc_left_h4): + vld1.32 {d0[]}, [r2, :32] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w4): + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt L(ipred_dc_left_w4) + pop {r4-r5, pc} +L(ipred_dc_left_h8): + vld1.8 {d0}, [r2, :64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #3 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w8): + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + 
vst1.8 {d0}, [r12, :64], r1 + bgt L(ipred_dc_left_w8) + pop {r4-r5, pc} +L(ipred_dc_left_h16): + vld1.8 {d0, d1}, [r2, :128] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w16): + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt L(ipred_dc_left_w16) + pop {r4-r5, pc} +L(ipred_dc_left_h32): + vld1.8 {d0, d1, d2, d3}, [r2, :128] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #5 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w32): + vmov.8 q1, q0 +1: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h64): + vld1.8 {d0, d1, d2, d3}, [r2, :128]! + vld1.8 {d4, d5, d6, d7}, [r2, :128] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q0, q1 + vadd.u16 q1, q2, q3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #6 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w64): + sub r1, r1, #32 + vmov.8 q1, q0 + vmov.8 q2, q0 + vmov.8 q3, q0 +1: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+        vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+        vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+        bgt 1b
+        pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
+        push {r4-r6, lr} // DC predictor over left + top edges; 16 bytes saved, so stack arguments start at [sp, #16]
+        ldr r4, [sp, #16] // r4 = height
+        sub r2, r2, r4 // r2 = topleft - height, where the height stage starts loading
+        add lr, r3, r4 // width + height
+        clz r3, r3 // clz(width)
+        clz r12, r4 // clz(height)
+        vdup.16 q15, lr // width + height
+        adr r5, L(ipred_dc_tbl)
+        rbit lr, lr // rbit(width + height)
+        sub r3, r3, #20 // 25 leading bits, minus table offset 5
+        sub r12, r12, #25 // height 64 -> index 0 ... height 4 -> index 4
+        clz lr, lr // ctz(width + height)
+        ldr r3, [r5, r3, lsl #2] // table offset of the L(ipred_dc_w*) stage
+        ldr r12, [r5, r12, lsl #2] // table offset of the L(ipred_dc_h*) stage
+        neg lr, lr // -ctz(width + height)
+        add r3, r5, r3 // r3 = width stage, reached through "bx r3" at the end of the height stage
+        add r5, r5, r12 // r5 = height stage
+        vshr.u16 q15, q15, #1 // (width + height) >> 1
+        vdup.16 q14, lr // -ctz(width + height)
+        add r12, r0, r1 // r12 = dst + stride
+        lsl r1, r1, #1 // r0 and r12 each step two rows per store
+        bx r5 // sum the left edge first
+
+        .align 2
+L(ipred_dc_tbl):
+        .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+        vld1.32 {d0[]}, [r2, :32]! // 4 edge pixels; the post-increment leaves r2 at topleft
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        bx r3 // continue with the width stage
+L(ipred_dc_w4):
+        add r2, r2, #1 // step over the topleft pixel; r2 = topleft + 1
+        vld1.32 {d1[]}, [r2]
+        vadd.s16 d0, d0, d30 // height-stage sum + (width + height) >> 1
+        vpaddl.u8 d1, d1
+        vpadd.u16 d1, d1
+        cmp r4, #4
+        vadd.s16 d0, d0, d1 // add the sum of the 4 pixels loaded above
+        vshl.u16 d0, d0, d28 // shift count is -ctz(width + height), i.e. a right shift
+        beq 1f
+        // h = 8/16
+        movw lr, #(0x3334/2)
+        movw r5, #(0x5556/2)
+        cmp r4, #16
+        it ne
+        movne lr, r5
+        vdup.16 d30, lr
+        vqdmulh.s16 d0, d0, d30 // doubling multiply-high: x * 0x3334 >> 16 (h = 16) or x * 0x5556 >> 16 (h = 8)
+1:
+        vdup.8 d0, d0[0] // broadcast the DC byte
+2:
+        vst1.32 {d0[0]}, [r0, :32], r1
+        vst1.32 {d0[0]}, [r12, :32], r1
+        subs r4, r4, #4
+        vst1.32 {d0[0]}, [r0, :32], r1
+        vst1.32 {d0[0]}, [r12, :32], r1
+        bgt 2b
+        pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+        vld1.8 {d0}, [r2, :64]!
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        bx r3 // continue with the width stage
+L(ipred_dc_w8):
+        add r2, r2, #1 // step over the topleft pixel
+        vld1.8 {d2}, [r2]
+        vadd.s16 d0, d0, d30
+        vpaddl.u8 d2, d2
+        vpadd.u16 d2, d2
+        vpadd.u16 d2, d2
+        cmp r4, #8
+        vadd.s16 d0, d0, d2
+        vshl.u16 d0, d0, d28
+        beq 1f
+        // h = 4/16/32
+        cmp r4, #32
+        movw lr, #(0x3334/2)
+        movw r5, #(0x5556/2)
+        it ne
+        movne lr, r5 // 0x3334 only for h = 32, 0x5556 for h = 4/16
+        vdup.16 d24, lr
+        vqdmulh.s16 d0, d0, d24
+1:
+        vdup.8 d0, d0[0]
+2:
+        vst1.8 {d0}, [r0, :64], r1
+        vst1.8 {d0}, [r12, :64], r1
+        subs r4, r4, #4
+        vst1.8 {d0}, [r0, :64], r1
+        vst1.8 {d0}, [r12, :64], r1
+        bgt 2b
+        pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+        vld1.8 {d0, d1}, [r2, :128]!
+        vaddl.u8 q0, d0, d1
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        bx r3 // continue with the width stage
+L(ipred_dc_w16):
+        add r2, r2, #1 // step over the topleft pixel
+        vld1.8 {d2, d3}, [r2]
+        vadd.s16 d0, d0, d30
+        vaddl.u8 q1, d2, d3
+        vadd.u16 d2, d2, d3
+        vpadd.u16 d2, d2
+        vpadd.u16 d2, d2
+        cmp r4, #16
+        vadd.s16 d0, d0, d2
+        vshl.u16 d0, d0, d28
+        beq 1f
+        // h = 4/8/32/64
+        tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+        movw lr, #(0x3334/2)
+        movw r5, #(0x5556/2)
+        it ne
+        movne lr, r5 // h = 8/32 hit the mask -> 0x5556; h = 4/64 keep 0x3334
+        vdup.16 d24, lr
+        vqdmulh.s16 d0, d0, d24
+1:
+        vdup.8 q0, d0[0]
+2:
+        vst1.8 {d0, d1}, [r0, :128], r1
+        vst1.8 {d0, d1}, [r12, :128], r1
+        subs r4, r4, #4
+        vst1.8 {d0, d1}, [r0, :128], r1
+        vst1.8 {d0, d1}, [r12, :128], r1
+        bgt 2b
+        pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+        vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+        vaddl.u8 q0, d0, d1
+        vaddl.u8 q1, d2, d3
+        vadd.u16 q0, q0, q1
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        bx r3 // continue with the width stage
+L(ipred_dc_w32):
+        add r2, r2, #1 // step over the topleft pixel
+        vld1.8 {d2, d3, d4, d5}, [r2]
+        vadd.s16 d0, d0, d30
+        vaddl.u8 q1, d2, d3
+        vaddl.u8 q2, d4, d5
+        vadd.u16 q1, q1, q2
+        vadd.u16 d2, d2, d3
+        vpadd.u16 d2, d2
+        vpadd.u16 d2, d2
+        cmp r4, #32
+        vadd.s16 d0, d0, d2
+        vshl.u16 d4, d0, d28
+        beq 1f
+        // h = 8/16/64
+        cmp r4, #8
+        movw lr, #(0x3334/2)
+        movw r5, #(0x5556/2)
+        it ne
+        movne lr, r5 // 0x3334 only for h = 8, 0x5556 for h = 16/64
+        vdup.16 d24, lr
+        vqdmulh.s16 d4, d4, d24
+1:
+        vdup.8 q0, d4[0]
+        vdup.8 q1, d4[0]
+2:
+        vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+        vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+        subs r4, r4, #4
+        vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+        vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+        bgt 2b
+        pop {r4-r6, pc}
+
+L(ipred_dc_h64):
+        vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+        vaddl.u8 q0, d0, d1
+        vld1.8 {d4, d5, d6, d7}, [r2, :128]!
+        vaddl.u8 q1, d2, d3
+        vaddl.u8 q2, d4, d5
+        vaddl.u8 q3, d6, d7
+        vadd.u16 q0, q0, q1
+        vadd.u16 q1, q2, q3
+        vadd.u16 q0, q0, q1
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        bx r3 // continue with the width stage
+L(ipred_dc_w64):
+        add r2, r2, #1 // step over the topleft pixel
+        vld1.8 {d2, d3, d4, d5}, [r2]!
+        vadd.s16 d0, d0, d30
+        vaddl.u8 q2, d4, d5
+        vaddl.u8 q1, d2, d3
+        vadd.u16 d4, d4, d5
+        vadd.u16 d2, d2, d3
+        vld1.8 {d16, d17, d18, d19}, [r2]
+        vpadd.u16 d4, d4
+        vpadd.u16 d2, d2
+        vpadd.u16 d4, d4
+        vpadd.u16 d2, d2
+        vaddl.u8 q8, d16, d17
+        vaddl.u8 q9, d18, d19
+        vadd.u16 d16, d16, d17
+        vadd.u16 d18, d18, d19
+        vpadd.u16 d16, d16
+        vpadd.u16 d18, d18
+        vpadd.u16 d16, d16
+        vpadd.u16 d18, d18
+        vadd.u16 d2, d2, d4
+        vadd.u16 d3, d16, d18
+        cmp r4, #64
+        vadd.s16 d0, d0, d2
+        vadd.s16 d0, d0, d3
+        vshl.u16 d18, d0, d28
+        beq 1f // h = 16/32
+        movw lr, #(0x5556/2)
+        movt lr, #(0x3334/2) // lr = (0x3334/2) << 16 | (0x5556/2)
+        mov r5, r4
+        and r5, r5, #31 // h = 16 -> shift by 16, h = 32 -> shift by 0
+        lsr lr, lr, r5 // low halfword becomes 0x3334/2 for h = 16, stays 0x5556/2 for h = 32
+        vdup.16 d30, lr
+        vqdmulh.s16 d18, d18, d30
+1:
+        sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+        vdup.8 q0, d18[0]
+        vdup.8 q1, d18[0]
+        vdup.8 q2, d18[0]
+        vdup.8 q3, d18[0]
+2:
+        vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+        vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+        vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+        vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+        subs r4, r4, #4
+        vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+        vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+        vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+        vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+        bgt 2b
+        pop {r4-r6, pc}
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                            const pixel *const topleft,
+//                            const int width, const int height, const int a,
+//                            const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
+        push {r4-r8, lr} // Paeth predictor; 24 bytes saved, so height is at [sp, #24]
+        ldr r4, [sp, #24] // r4 = height
+        clz lr, r3
+        adr r5, L(ipred_paeth_tbl)
+        sub lr, lr, #25 // width 64 -> index 0 ... width 4 -> index 4
+        ldr lr, [r5, lr, lsl #2]
+        vld1.8 {d4[], d5[]}, [r2] // q2 = topleft pixel in every lane
+        add r8, r2, #1 // r8 = topleft + 1 (top row)
+        sub r2, r2, #4 // r2 = topleft - 4: the four left pixels used by the first iteration
+        add r5, r5, lr
+        mov r7, #-4 // the left pointer moves back 4 bytes per iteration
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r5
+
+        .align 2
+L(ipred_paeth_tbl):
+        .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+        .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32 {d6[], d7[]}, [r8] // q3 = the 4 top pixels, repeated in every 32-bit lane
+        vsubl.u8 q8, d6, d4 // top - topleft
+4:
+        vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // four left pixels, one per register, each replicated
+        vzip.32 d0, d1
+        vzip.32 d2, d3
+        vaddw.u8 q9, q8, d0
+        vaddw.u8 q10, q8, d2
+        vqmovun.s16 d18, q9 // base
+        vqmovun.s16 d19, q10
+        vmov d1, d2
+        vabd.u8 q10, q3, q9 // tdiff
+        vabd.u8 q11, q2, q9 // tldiff
+        vabd.u8 q9, q0, q9 // ldiff
+        vmin.u8 q12, q10, q11 // min(tdiff, tldiff)
+        vcge.u8 q10, q11, q10 // tldiff >= tdiff
+        vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff
+        vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft
+        vbit q10, q0, q9 // ldiff <= min ? left : ...
+        vst1.32 {d21[1]}, [r0, :32], r1
+        vst1.32 {d21[0]}, [r6, :32], r1
+        subs r4, r4, #4
+        vst1.32 {d20[1]}, [r0, :32], r1
+        vst1.32 {d20[0]}, [r6, :32], r1
+        bgt 4b
+        pop {r4-r8, pc}
+80:
+        vld1.8 {d6}, [r8]
+        vsubl.u8 q8, d6, d4 // top - topleft
+        vmov d7, d6 // q3 = top row in both halves
+8:
+        vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // four left pixels, one per register, each replicated
+        vaddw.u8 q9, q8, d0
+        vaddw.u8 q10, q8, d1
+        vaddw.u8 q11, q8, d2
+        vaddw.u8 q12, q8, d3
+        vqmovun.s16 d18, q9 // base
+        vqmovun.s16 d19, q10
+        vqmovun.s16 d20, q11
+        vqmovun.s16 d21, q12
+        vabd.u8 q11, q3, q9 // tdiff
+        vabd.u8 q12, q3, q10
+        vabd.u8 q13, q2, q9 // tldiff
+        vabd.u8 q14, q2, q10
+        vabd.u8 q10, q1, q10 // ldiff
+        vabd.u8 q9, q0, q9
+        vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+        vcge.u8 q12, q14, q12 // tldiff >= tdiff
+        vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+        vcge.u8 q11, q13, q11 // tldiff >= tdiff
+        vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+        vcge.u8 q9, q14, q9
+        vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+        vbsl q11, q3, q2
+        vbit q12, q1, q10 // ldiff <= min ? left : ...
+        vbit q11, q0, q9
+        vst1.8 {d25}, [r0, :64], r1 // d25 was built from d3, the byte at the highest address of the load
+        vst1.8 {d24}, [r6, :64], r1
+        subs r4, r4, #4
+        vst1.8 {d23}, [r0, :64], r1
+        vst1.8 {d22}, [r6, :64], r1
+        bgt 8b
+        pop {r4-r8, pc}
+160:
+320:
+640:
+        vld1.8 {d6}, [r8]!
+        mov r12, r3 // keep width for the per-row-group reset
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add r5, r0, r1 // r1 is 2 * stride here, so r5 = dst + 2 * stride
+        add lr, r6, r1 // lr = dst + 3 * stride
+        lsl r1, r1, #1
+        sub r1, r1, r3 // r1 = 4 * stride - width, applied after the post-incremented stores
+1:
+        vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+2:
+        vsubl.u8 q8, d6, d4 // top - topleft
+        vmov d7, d6
+        vaddw.u8 q9, q8, d0
+        vaddw.u8 q10, q8, d1
+        vaddw.u8 q11, q8, d2
+        vaddw.u8 q12, q8, d3
+        vqmovun.s16 d18, q9 // base
+        vqmovun.s16 d19, q10
+        vqmovun.s16 d20, q11
+        vqmovun.s16 d21, q12
+        vabd.u8 q11, q3, q9 // tdiff
+        vabd.u8 q12, q3, q10
+        vabd.u8 q13, q2, q9 // tldiff
+        vabd.u8 q14, q2, q10
+        vabd.u8 q10, q1, q10 // ldiff
+        vabd.u8 q9, q0, q9
+        vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+        vcge.u8 q12, q14, q12 // tldiff >= tdiff
+        vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+        vcge.u8 q11, q13, q11 // tldiff >= tdiff
+        vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+        vcge.u8 q9, q14, q9
+        vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+        vbsl q11, q3, q2
+        vbit q12, q1, q10 // ldiff <= min ? left : ...
+        vbit q11, q0, q9
+        subs r3, r3, #8 // 8 columns done; the NEON stores below leave the flags untouched
+        vst1.8 {d25}, [r0, :64]!
+        vst1.8 {d24}, [r6, :64]!
+        vst1.8 {d23}, [r5, :64]!
+        vst1.8 {d22}, [lr, :64]!
+        ble 8f
+        vld1.8 {d6}, [r8]! // next 8 top pixels
+        b 2b
+8:
+        subs r4, r4, #4
+        ble 9f
+        // End of horizontal loop, move pointers to next four rows
+        sub r8, r8, r12 // rewind the top pointer by width
+        add r0, r0, r1
+        add r6, r6, r1
+        vld1.8 {d6}, [r8]!
+        add r5, r5, r1
+        add lr, lr, r1
+        mov r3, r12
+        b 1b
+9:
+        pop {r4-r8, pc}
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
+        push {r4-r10, lr} // two-directional smooth predictor; 32 bytes saved, so height is at [sp, #32]
+        ldr r4, [sp, #32] // r4 = height
+        movrel r10, X(sm_weights)
+        add r12, r10, r4 // r12 = sm_weights + height (read below as weights_ver)
+        add r10, r10, r3 // r10 = sm_weights + width (read below as weights_hor)
+        clz r9, r3
+        adr r5, L(ipred_smooth_tbl)
+        sub lr, r2, r4 // lr = topleft - height
+        sub r9, r9, #25 // width 64 -> index 0 ... width 4 -> index 4
+        ldr r9, [r5, r9, lsl #2]
+        vld1.8 {d4[]}, [lr] // bottom
+        add r8, r2, #1 // r8 = topleft + 1 (top row)
+        add r5, r5, r9
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r5
+
+        .align 2
+L(ipred_smooth_tbl):
+        .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+        .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32 {d16[]}, [r8] // top
+        vld1.32 {d18[]}, [r10, :32] // weights_hor
+        sub r2, r2, #4
+        mov r7, #-4
+        vdup.8 q3, d16[3] // right
+        vsubl.u8 q8, d16, d4 // top-bottom
+        vmovl.u8 q9, d18 // weights_hor
+4:
+        vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+        vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+        vshll.i8 q12, d6, #8 // right*256
+        vshll.i8 q13, d6, #8
+        vzip.32 d1, d0 // left, flipped
+        vzip.32 d3, d2
+        vzip.32 d20, d21 // weights_ver
+        vzip.32 d22, d23
+        vshll.i8 q14, d4, #8 // bottom*256
+        vshll.i8 q15, d4, #8
+        vsubl.u8 q0, d1, d6 // left-right
+        vsubl.u8 q1, d3, d6
+        vmovl.u8 q10, d20 // weights_ver
+        vmovl.u8 q11, d22
+        vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor
+        vmla.i16 q13, q0, q9 // (left flipped)
+        vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q15, q8, q11
+        vhadd.u16 q12, q12, q14 // halving add of the horizontal and vertical terms
+        vhadd.u16 q13, q13, q15
+        vrshrn.i16 d24, q12, #8 // rounding narrow back to 8 bit
+        vrshrn.i16 d25, q13, #8
+        vst1.32 {d24[0]}, [r0, :32], r1
+        vst1.32 {d24[1]}, [r6, :32], r1
+        subs r4, r4, #4
+        vst1.32 {d25[0]}, [r0, :32], r1
+        vst1.32 {d25[1]}, [r6, :32], r1
+        bgt 4b
+        pop {r4-r10, pc}
+80:
+        vld1.8 {d16}, [r8] // top
+        vld1.8 {d18}, [r10, :64] // weights_hor
+        sub r2, r2, #2 // two rows per iteration
+        mov r7, #-2
+        vdup.8 q3, d16[7] // right
+        vsubl.u8 q8, d16, d4 // top-bottom
+        vmovl.u8 q9, d18 // weights_hor
+8:
+        vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+        vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+        vshll.i8 q12, d6, #8 // right*256
+        vshll.i8 q13, d6, #8
+        vshll.i8 q14, d4, #8 // bottom*256
+        vshll.i8 q15, d4, #8
+        vsubl.u8 q1, d0, d6 // left-right (left flipped)
+        vsubl.u8 q0, d1, d6
+        vmovl.u8 q10, d20 // weights_ver
+        vmovl.u8 q11, d22
+        vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+        vmla.i16 q13, q1, q9
+        vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q15, q8, q11
+        vhadd.u16 q12, q12, q14
+        vhadd.u16 q13, q13, q15
+        vrshrn.i16 d24, q12, #8
+        vrshrn.i16 d25, q13, #8
+        subs r4, r4, #2
+        vst1.8 {d24}, [r0, :64], r1
+        vst1.8 {d25}, [r6, :64], r1
+        bgt 8b
+        pop {r4-r10, pc}
+160:
+320:
+640:
+        add lr, r2, r3 // lr = topleft + width
+        sub r2, r2, #2
+        mov r7, #-2
+        vld1.8 {d6[], d7[]}, [lr] // right
+        sub r1, r1, r3 // r1 = 2 * stride - width, applied after the post-incremented stores
+        mov r9, r3 // keep width for the per-row-pair reset
+
+1:
+        vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+        vld2.8 {d20[], d22[]}, [r12, :16]!
// weights_ver
+        vsubl.u8 q1, d0, d6 // left-right (left flipped)
+        vsubl.u8 q0, d1, d6
+        vmovl.u8 q10, d20 // weights_ver
+        vmovl.u8 q11, d22
+2:
+        vld1.8 {d16}, [r8]! // top
+        vld1.8 {d18}, [r10, :64]! // weights_hor
+        vshll.i8 q12, d6, #8 // right*256
+        vshll.i8 q13, d6, #8
+        vmovl.u8 q9, d18 // weights_hor
+        vshll.i8 q14, d4, #8 // bottom*256
+        vshll.i8 q15, d4, #8
+        vsubl.u8 q8, d16, d4 // top-bottom
+        vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+        vmla.i16 q13, q1, q9
+        vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q15, q8, q11
+        vhadd.u16 q12, q12, q14
+        vhadd.u16 q13, q13, q15
+        vrshrn.i16 d24, q12, #8
+        vrshrn.i16 d25, q13, #8
+        subs r3, r3, #8 // 8 columns done; the NEON stores below leave the flags untouched
+        vst1.8 {d24}, [r0, :64]!
+        vst1.8 {d25}, [r6, :64]!
+        bgt 2b
+        subs r4, r4, #2
+        ble 9f
+        sub r8, r8, r9 // rewind the top pointer by width (r9)
+        sub r10, r10, r9 // rewind the weights_hor pointer by width
+        add r0, r0, r1
+        add r6, r6, r1
+        mov r3, r9
+        b 1b
+9:
+        pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+        push {r4-r7, lr} // vertical-only smooth predictor; 20 bytes saved, so height is at [sp, #20]
+        ldr r4, [sp, #20] // r4 = height
+        movrel r7, X(sm_weights)
+        add r7, r7, r4 // r7 = sm_weights + height (read below as weights_ver)
+        clz lr, r3
+        adr r5, L(ipred_smooth_v_tbl)
+        sub r12, r2, r4 // r12 = topleft - height
+        sub lr, lr, #25 // width 64 -> index 0 ... width 4 -> index 4
+        ldr lr, [r5, lr, lsl #2]
+        vld1.8 {d4[]}, [r12] // bottom
+        add r2, r2, #1 // r2 = topleft + 1 (top row)
+        add r5, r5, lr
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r5
+
+        .align 2
+L(ipred_smooth_v_tbl):
+        .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+        .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32 {d6[]}, [r2] // top
+        vsubl.u8 q3, d6, d4 // top-bottom
+4:
+        vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+        vshll.i8 q10, d4, #8 // bottom*256
+        vshll.i8 q11, d4, #8
+        vzip.32 d16, d17 // weights_ver
+        vzip.32 d18, d19
+        vmovl.u8 q8, d16 // weights_ver
+        vmovl.u8 q9, d18
+        subs r4, r4, #4
+        vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q11, q3, q9
+        vrshrn.i16 d20, q10, #8
+        vrshrn.i16 d21, q11, #8
+        vst1.32 {d20[0]}, [r0, :32], r1
+        vst1.32 {d20[1]}, [r6, :32], r1
+        vst1.32 {d21[0]}, [r0, :32], r1
+        vst1.32 {d21[1]}, [r6, :32], r1
+        bgt 4b
+        pop {r4-r7, pc}
+80:
+        vld1.8 {d6}, [r2] // top
+        vsubl.u8 q3, d6, d4 // top-bottom
+8:
+        vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+        vshll.i8 q12, d4, #8 // bottom*256
+        vshll.i8 q13, d4, #8
+        vshll.i8 q14, d4, #8
+        vshll.i8 q15, d4, #8
+        vmovl.u8 q8, d16 // weights_ver
+        vmovl.u8 q9, d18
+        vmovl.u8 q10, d20
+        vmovl.u8 q11, d22
+        vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q13, q3, q9
+        vmla.i16 q14, q3, q10
+        vmla.i16 q15, q3, q11
+        vrshrn.i16 d24, q12, #8
+        vrshrn.i16 d25, q13, #8
+        vrshrn.i16 d26, q14, #8
+        vrshrn.i16 d27, q15, #8
+        vst1.8 {d24}, [r0, :64], r1
+        vst1.8 {d25}, [r6, :64], r1
+        subs r4, r4, #4
+        vst1.8 {d26}, [r0, :64], r1
+        vst1.8 {d27}, [r6, :64], r1
+        bgt 8b
+        pop {r4-r7, pc}
+160:
+320:
+640:
+        vpush {q4-q7} // q4-q7 (d8-d15) are callee-saved under the AAPCS and are used below
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add r5, r0, r1 // r1 is 2 * stride here, so r5 = dst + 2 * stride
+        add lr, r6, r1 // lr = dst + 3 * stride
+        lsl r1, r1, #1
+        sub r1, r1, r3 // r1 = 4 * stride - width, applied after the post-incremented stores
+        mov r12, r3 // keep width for the per-row-group reset
+
+1:
+        vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+        vmovl.u8 q4, d8 // weights_ver
+        vmovl.u8 q5, d10
+        vmovl.u8 q6, d12
+        vmovl.u8 q7, d14
+2:
+        vld1.8 {q3}, [r2]!
// top
+        vshll.i8 q8, d4, #8 // bottom*256
+        vshll.i8 q9, d4, #8
+        vshll.i8 q10, d4, #8
+        vshll.i8 q11, d4, #8
+        vsubl.u8 q0, d6, d4 // top-bottom
+        vsubl.u8 q1, d7, d4
+        vshll.i8 q12, d4, #8
+        vshll.i8 q13, d4, #8
+        vshll.i8 q14, d4, #8
+        vshll.i8 q15, d4, #8
+        vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q9, q1, q4
+        vmla.i16 q10, q0, q5
+        vmla.i16 q11, q1, q5
+        vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver
+        vmla.i16 q13, q1, q6
+        vmla.i16 q14, q0, q7
+        vmla.i16 q15, q1, q7
+        vrshrn.i16 d16, q8, #8
+        vrshrn.i16 d17, q9, #8
+        vrshrn.i16 d18, q10, #8
+        vrshrn.i16 d19, q11, #8
+        vrshrn.i16 d20, q12, #8
+        vrshrn.i16 d21, q13, #8
+        vrshrn.i16 d22, q14, #8
+        vrshrn.i16 d23, q15, #8
+        subs r3, r3, #16 // 16 columns done; the NEON stores below leave the flags untouched
+        vst1.8 {q8}, [r0, :128]!
+        vst1.8 {q9}, [r6, :128]!
+        vst1.8 {q10}, [r5, :128]!
+        vst1.8 {q11}, [lr, :128]!
+        bgt 2b
+        subs r4, r4, #4
+        ble 9f
+        sub r2, r2, r12 // rewind the top pointer by width (r12)
+        add r0, r0, r1
+        add r6, r6, r1
+        add r5, r5, r1
+        add lr, lr, r1
+        mov r3, r12
+        b 1b
+9:
+        vpop {q4-q7}
+        pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+        push {r4-r8, lr} // horizontal-only smooth predictor; 24 bytes saved, so height is at [sp, #24]
+        ldr r4, [sp, #24] // r4 = height
+        movrel r8, X(sm_weights)
+        add r8, r8, r3 // r8 = sm_weights + width (read below as weights_hor)
+        clz lr, r3
+        adr r5, L(ipred_smooth_h_tbl)
+        add r12, r2, r3 // r12 = topleft + width
+        sub lr, lr, #25 // width 64 -> index 0 ... width 4 -> index 4
+        ldr lr, [r5, lr, lsl #2]
+        vld1.8 {d4[]}, [r12] // right
+        add r5, r5, lr
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r5
+
+        .align 2
+L(ipred_smooth_h_tbl):
+        .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+        .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32 {d6[]}, [r8, :32] // weights_hor
+        sub r2, r2, #4 // r2 = topleft - 4: four left pixels per iteration
+        mov r7, #-4
+        vmovl.u8 q3, d6 // weights_hor
+4:
+        vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+        vshll.i8 q8, d4, #8 // right*256
+        vshll.i8 q9, d4, #8
+        vzip.32 d3, d2 // left, flipped
+        vzip.32 d1, d0
+        vsubl.u8 q1, d3, d4 // left-right
+        vsubl.u8 q0, d1, d4
+        subs r4, r4, #4
+        vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor
+        vmla.i16 q9, q0, q3
+        vrshrn.i16 d16, q8, #8
+        vrshrn.i16 d17, q9, #8
+        vst1.32 {d16[0]}, [r0, :32], r1
+        vst1.32 {d16[1]}, [r6, :32], r1
+        vst1.32 {d17[0]}, [r0, :32], r1
+        vst1.32 {d17[1]}, [r6, :32], r1
+        bgt 4b
+        pop {r4-r8, pc}
+80:
+        vld1.8 {d6}, [r8, :64] // weights_hor
+        sub r2, r2, #4
+        mov r7, #-4
+        vmovl.u8 q3, d6 // weights_hor
+8:
+        vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left
+        vshll.i8 q12, d4, #8 // right*256
+        vshll.i8 q13, d4, #8
+        vshll.i8 q14, d4, #8
+        vshll.i8 q15, d4, #8
+        vsubl.u8 q11, d22, d4 // left-right
+        vsubl.u8 q10, d20, d4
+        vsubl.u8 q9, d18, d4
+        vsubl.u8 q8, d16, d4
+        vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor
+        vmla.i16 q13, q10, q3 // (left flipped)
+        vmla.i16 q14, q9, q3
+        vmla.i16 q15, q8, q3
+        vrshrn.i16 d24, q12, #8
+        vrshrn.i16 d25, q13, #8
+        vrshrn.i16 d26, q14, #8
+        vrshrn.i16 d27, q15, #8
+        vst1.8 {d24}, [r0, :64], r1
+        vst1.8 {d25}, [r6, :64], r1
+        subs r4, r4, #4
+        vst1.8 {d26}, [r0, :64], r1
+        vst1.8 {d27}, [r6, :64], r1
+        bgt 8b
+        pop {r4-r8, pc}
+160:
+320:
+640:
+        vpush {q4-q7} // q4-q7 (d8-d15) are callee-saved under the AAPCS and are used below
+        sub r2, r2, #4
+        mov r7, #-4
+        // Set up pointers for four rows in parallel; r0, r6, r5, lr
+        add r5, r0, r1 // r1 is 2 * stride here, so r5 = dst + 2 * stride
+        add lr, r6, r1 // lr = dst + 3 * stride
+        lsl r1, r1, #1
+        sub r1, r1, r3 // r1 = 4 * stride - width, applied after the post-incremented stores
+        mov r12, r3 // keep width for the per-row-group reset
+
+1:
+        vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left
+        vsubl.u8 q4, d8, d4 // left-right
+        vsubl.u8 q5, d10, d4
+        vsubl.u8 q6, d12, d4
+        vsubl.u8 q7, d14, d4
+2:
+        vld1.8 {q1}, [r8, :128]!
// weights_hor
+        vshll.i8 q8, d4, #8 // right*256
+        vshll.i8 q9, d4, #8
+        vshll.i8 q10, d4, #8
+        vshll.i8 q11, d4, #8
+        vmovl.u8 q0, d2 // weights_hor
+        vmovl.u8 q1, d3
+        vshll.i8 q12, d4, #8
+        vshll.i8 q13, d4, #8
+        vshll.i8 q14, d4, #8
+        vshll.i8 q15, d4, #8
+        vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor
+        vmla.i16 q9, q7, q1 // (left flipped)
+        vmla.i16 q10, q6, q0
+        vmla.i16 q11, q6, q1
+        vmla.i16 q12, q5, q0
+        vmla.i16 q13, q5, q1
+        vmla.i16 q14, q4, q0
+        vmla.i16 q15, q4, q1
+        vrshrn.i16 d16, q8, #8
+        vrshrn.i16 d17, q9, #8
+        vrshrn.i16 d18, q10, #8
+        vrshrn.i16 d19, q11, #8
+        vrshrn.i16 d20, q12, #8
+        vrshrn.i16 d21, q13, #8
+        vrshrn.i16 d22, q14, #8
+        vrshrn.i16 d23, q15, #8
+        subs r3, r3, #16 // 16 columns done; the NEON stores below leave the flags untouched
+        vst1.8 {q8}, [r0, :128]!
+        vst1.8 {q9}, [r6, :128]!
+        vst1.8 {q10}, [r5, :128]!
+        vst1.8 {q11}, [lr, :128]!
+        bgt 2b
+        subs r4, r4, #4
+        ble 9f
+        sub r8, r8, r12 // rewind the weights_hor pointer by width (r12)
+        add r0, r0, r1
+        add r6, r6, r1
+        add r5, r5, r1
+        add lr, lr, r1
+        mov r3, r12
+        b 1b
+9:
+        vpop {q4-q7}
+        pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int filt_idx,
+//                             const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
+        push {r4-r8, lr} // filter-intra predictor; 24 bytes saved: height at [sp, #24], filt_idx at [sp, #28]
+        movw r12, #511
+        ldr r5, [sp, #28] // r5 = filt_idx
+        ldr r4, [sp, #24] // r4 = height
+        and r5, r5, r12 // 511
+        movrel r6, X(filter_intra_taps)
+        lsl r5, r5, #6 // 64 bytes per filt_idx entry
+        add r6, r6, r5 // r6 = filter_intra_taps + filt_idx * 64
+        vld1.8 {d20, d21, d22, d23}, [r6, :128]! // first 32 tap bytes
+        clz lr, r3
+        adr r5, L(ipred_filter_tbl)
+        vld1.8 {d27, d28, d29}, [r6, :64] // next 24 tap bytes; placed in d27-d29 so each widening below reads its source before that register is overwritten
+        sub lr, lr, #26 // width 32 -> index 0 ... width 4 -> index 3
+        ldr lr, [r5, lr, lsl #2]
+        vmovl.s8 q8, d20 // q8-q14 = the seven tap vectors sign-extended to 16 bit, used below as filter(0)..filter(6)
+        vmovl.s8 q9, d21
+        add r5, r5, lr
+        vmovl.s8 q10, d22
+        vmovl.s8 q11, d23
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        vmovl.s8 q12, d27
+        vmovl.s8 q13, d28
+        vmovl.s8 q14, d29
+        add r8, r2, #1 // r8 = topleft + 1 (top row)
+        bx r5
+
+        .align 2
+L(ipred_filter_tbl):
+        .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB
+        .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB
+
+40:
+        vld1.32 {d0[]}, [r8] // top (0-3)
+        sub r2, r2, #2 // r2 = topleft - 2: two left pixels followed by topleft
+        mov r7, #-2 // two rows per iteration
+        vmovl.u8 q0, d0 // top (0-3)
+4:
+        vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+        vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+        vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+        vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+        vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+        vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+        vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+        vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+        vqrshrun.s16 d4, q2, #4 // rounding shift by 4 with unsigned saturation to 8 bit
+        subs r4, r4, #2
+        vst1.32 {d4[0]}, [r0, :32], r1
+        vmovl.u8 q0, d4 // the freshly written rows feed the next iteration
+        vst1.32 {d4[1]}, [r6, :32], r1
+        vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3]
+        bgt 4b
+        pop {r4-r8, pc}
+80:
+        vld1.8 {d0}, [r8] // top (0-7)
+        sub r2, r2, #2
+        mov r7, #-2
+        vmovl.u8 q0, d0 // top (0-7)
+8:
+        vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+        vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+        vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+        vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+        vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+        vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+        vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+        vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+        vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+        vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+        vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+        vqrshrun.s16 d4, q2, #4
+        vmovl.u8 q1, d4 // first block, in 16 bit
+        vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+        vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+        vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5)
+        vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6)
+        vqrshrun.s16 d5, q3, #4
+        vzip.32 d4, d5 // interleave the two 4x2 blocks into two 8-pixel rows
+        subs r4, r4, #2
+        vst1.64 {d4}, [r0, :64], r1
+        vmovl.u8 q0, d5 // the second output row becomes the next top
+        vst1.64 {d5}, [r6, :64], r1
+        bgt 8b
+        pop {r4-r8, pc}
+160:
+320:
+        vpush {q4-q5} // q4/q5 (d8-d11) are callee-saved under the AAPCS and are used below
+        sub r2, r2, #2
+        mov r7, #-2
+        sub r1, r1, r3 // r1 = 2 * stride - width, applied after the post-incremented stores
+        mov lr, r3 // keep width for the per-row-pair reset
+
+1:
+        vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+        vmovl.u8 q0, d0 // left (0-1) + topleft (2)
+2:
+        vld1.8 {q2}, [r8]! // top(0-15)
+        vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+        vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+        vmovl.u8 q1, d4 // top(0-7)
+        vmovl.u8 q2, d5 // top(8-15)
+        vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+        vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+        vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+        vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+        vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+        vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+        vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+        vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+        vqrshrun.s16 d6, q3, #4
+        vmovl.u8 q0, d6 // first block, in 16 bit
+        vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+        vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+        vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5)
+        vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6)
+
+        vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+        vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+        vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+        vqrshrun.s16 d7, q4, #4
+        vmovl.u8 q0, d7 // second block, in 16 bit
+        vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+        vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+        vmla.i16 q5, q13, d0[3] // p5(left[0])
* filter(5)
+        vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+        vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1)
+        vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2)
+        vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3)
+        vqrshrun.s16 d8, q5, #4
+        vmovl.u8 q0, d8 // third block, in 16 bit
+        vmov.u8 r12, d5[6] // 16-bit lane 3 of the widened d5; re-inserted as byte 4 of d0 below
+        vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4)
+        vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0)
+        vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5)
+        vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6)
+        vmov.8 d0[4], r12 // 16-bit lane d0[2], the lane the next pass reads as topleft
+
+        subs r3, r3, #16 // 16 columns done; the NEON stores below leave the flags untouched
+        vqrshrun.s16 d9, q15, #4
+
+        vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]!
+        vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]!
+        ble 8f
+        vmov.u8 r12, d9[7] // last output byte of the second row -> lane d0[0], read above as left[1]
+        vmov.8 d0[0], r12
+        vmov.u8 r12, d9[3] // last output byte of the first row -> lane d0[1], read above as left[0]
+        vmov.8 d0[2], r12
+        b 2b
+8:
+        subs r4, r4, #2
+
+        ble 9f
+        sub r8, r6, lr // r6 has advanced by width, so r8 = start of the second row just written (the next top)
+        add r0, r0, r1
+        add r6, r6, r1
+        mov r3, lr
+        b 1b
+9:
+        vpop {q4-q5}
+        pop {r4-r8, pc}
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const int w, const int h);
+function pal_pred_8bpc_neon, export=1
+        push {r4-r5, lr} // palette lookup; 12 bytes saved: w at [sp, #12], h at [sp, #16]
+        ldr r4, [sp, #12] // r4 = w
+        ldr r5, [sp, #16] // r5 = h
+        vld1.16 {q0}, [r2, :128] // eight 16-bit palette entries
+        clz lr, r4
+        adr r12, L(pal_pred_tbl)
+        sub lr, lr, #25 // w 64 -> index 0 ... w 4 -> index 4
+        ldr lr, [r12, lr, lsl #2]
+        vmovn.i16 d0, q0 // keep the low byte of each entry: d0 is the 8-byte table for vtbl
+        add r12, r12, lr
+        add r2, r0, r1 // r2 = dst + stride (the pal pointer is no longer needed)
+        bx r12
+
+        .align 2
+L(pal_pred_tbl):
+        .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+        .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+        lsl r1, r1, #1 // r0 and r2 each step two rows per store
+4:
+        vld1.8 {q1}, [r3, :128]! // 16 indices = 4 rows of 4
+        subs r5, r5, #4
+        vtbl.8 d2, {d0}, d2
+        vtbl.8 d3, {d0}, d3
+        vst1.32 {d2[0]}, [r0, :32], r1
+        vst1.32 {d2[1]}, [r2, :32], r1
+        vst1.32 {d3[0]}, [r0, :32], r1
+        vst1.32 {d3[1]}, [r2, :32], r1
+        bgt 4b
+        pop {r4-r5, pc}
+80:
+        lsl r1, r1, #1
+8:
+        vld1.8 {q1, q2}, [r3, :128]! // 32 indices = 4 rows of 8
+        subs r5, r5, #4
+        vtbl.8 d2, {d0}, d2
+        vtbl.8 d3, {d0}, d3
+        vst1.8 {d2}, [r0, :64], r1
+        vtbl.8 d4, {d0}, d4
+        vst1.8 {d3}, [r2, :64], r1
+        vtbl.8 d5, {d0}, d5
+        vst1.8 {d4}, [r0, :64], r1
+        vst1.8 {d5}, [r2, :64], r1
+        bgt 8b
+        pop {r4-r5, pc}
+160:
+        lsl r1, r1, #1
+16:
+        vld1.8 {q8, q9}, [r3, :128]! // 64 indices = 4 rows of 16, together with the load below
+        subs r5, r5, #4
+        vld1.8 {q10, q11}, [r3, :128]!
+        vtbl.8 d16, {d0}, d16
+        vtbl.8 d17, {d0}, d17
+        vtbl.8 d18, {d0}, d18
+        vtbl.8 d19, {d0}, d19
+        vtbl.8 d20, {d0}, d20
+        vtbl.8 d21, {d0}, d21
+        vst1.8 {q8}, [r0, :128], r1
+        vtbl.8 d22, {d0}, d22
+        vst1.8 {q9}, [r2, :128], r1
+        vtbl.8 d23, {d0}, d23
+        vst1.8 {q10}, [r0, :128], r1
+        vst1.8 {q11}, [r2, :128], r1
+        bgt 16b
+        pop {r4-r5, pc}
+320:
+        lsl r1, r1, #1
+32:
+        vld1.8 {q8, q9}, [r3, :128]! // 64 indices = 2 rows of 32, together with the load below
+        subs r5, r5, #2
+        vld1.8 {q10, q11}, [r3, :128]!
+        vtbl.8 d16, {d0}, d16
+        vtbl.8 d17, {d0}, d17
+        vtbl.8 d18, {d0}, d18
+        vtbl.8 d19, {d0}, d19
+        vtbl.8 d20, {d0}, d20
+        vtbl.8 d21, {d0}, d21
+        vst1.8 {q8, q9}, [r0, :128], r1
+        vtbl.8 d22, {d0}, d22
+        vtbl.8 d23, {d0}, d23
+        vst1.8 {q10, q11}, [r2, :128], r1
+        bgt 32b
+        pop {r4-r5, pc}
+640:
+        sub r1, r1, #32 // one row per iteration; the first store post-increments by 32 bytes
+64:
+        vld1.8 {q8, q9}, [r3, :128]!
+        subs r5, r5, #1
+        vld1.8 {q10, q11}, [r3, :128]!
+        vtbl.8 d16, {d0}, d16
+        vtbl.8 d17, {d0}, d17
+        vtbl.8 d18, {d0}, d18
+        vtbl.8 d19, {d0}, d19
+        vtbl.8 d20, {d0}, d20
+        vtbl.8 d21, {d0}, d21
+        vst1.8 {q8, q9}, [r0, :128]!
+        vtbl.8 d22, {d0}, d22
+        vtbl.8 d23, {d0}, d23
+        vst1.8 {q10, q11}, [r0, :128], r1
+        bgt 64b
+        pop {r4-r5, pc}
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
+        push {r4-r8, lr} // CfL with a fixed dc of 128; 24 bytes saved: height/ac/alpha at [sp, #24/#28/#32]
+        ldr r4, [sp, #24] // r4 = height
+        ldr r5, [sp, #28] // r5 = ac
+        ldr r6, [sp, #32] // r6 = alpha
+        clz lr, r3
+        adr r12, L(ipred_cfl_128_tbl)
+        sub lr, lr, #26 // width 32 -> index 0 ... width 4 -> index 3
+        ldr lr, [r12, lr, lsl #2]
+        vmov.i16 q0, #128 // dc
+        vdup.i16 q1, r6 // alpha
+        add r12, r12, lr
+        add r6, r0, r1 // r6 = dst + stride (alpha now lives in q1)
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r12
+
+        .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+        .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+        vld1.16 {q2, q3}, [r5, :128]! // 16 ac values = 4 rows of 4
+        vmul.i16 q2, q2, q1 // diff = ac * alpha
+        vmul.i16 q3, q3, q1
+        vshr.s16 q8, q2, #15 // sign = diff >> 15
+        vshr.s16 q9, q3, #15
+        vadd.i16 q2, q2, q8 // diff + sign
+        vadd.i16 q3, q3, q9
+        vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16 q3, q3, #6
+        vadd.i16 q2, q2, q0 // dc + apply_sign()
+        vadd.i16 q3, q3, q0
+        vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign())
+        vqmovun.s16 d5, q3
+        vst1.32 {d4[0]}, [r0, :32], r1
+        vst1.32 {d4[1]}, [r6, :32], r1
+        subs r4, r4, #4
+        vst1.32 {d5[0]}, [r0, :32], r1
+        vst1.32 {d5[1]}, [r6, :32], r1
+        bgt L(ipred_cfl_splat_w4)
+        pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+        vld1.16 {q8, q9}, [r5, :128]! // 32 ac values = 4 rows of 8, together with the load below
+        vld1.16 {q10, q11}, [r5, :128]!
+        vmul.i16 q8, q8, q1 // diff = ac * alpha
+        vmul.i16 q9, q9, q1
+        vmul.i16 q10, q10, q1
+        vmul.i16 q11, q11, q1
+        vshr.s16 q12, q8, #15 // sign = diff >> 15
+        vshr.s16 q13, q9, #15
+        vshr.s16 q14, q10, #15
+        vshr.s16 q15, q11, #15
+        vadd.i16 q8, q8, q12 // diff + sign
+        vadd.i16 q9, q9, q13
+        vadd.i16 q10, q10, q14
+        vadd.i16 q11, q11, q15
+        vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16 q9, q9, #6
+        vrshr.s16 q10, q10, #6
+        vrshr.s16 q11, q11, #6
+        vadd.i16 q8, q8, q0 // dc + apply_sign()
+        vadd.i16 q9, q9, q0
+        vadd.i16 q10, q10, q0
+        vadd.i16 q11, q11, q0
+        vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+        vqmovun.s16 d17, q9
+        vqmovun.s16 d18, q10
+        vqmovun.s16 d19, q11
+        vst1.8 {d16}, [r0, :64], r1
+        vst1.8 {d17}, [r6, :64], r1
+        subs r4, r4, #4
+        vst1.8 {d18}, [r0, :64], r1
+        vst1.8 {d19}, [r6, :64], r1
+        bgt L(ipred_cfl_splat_w8)
+        pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+        add r12, r5, r3, lsl #1 // r12 = ac + width * 2 bytes: the second ac row
+        sub r1, r1, r3 // r1 = 2 * stride - width, applied after the post-incremented stores
+        mov lr, r3 // keep width for the per-row-pair reset
+1:
+        vld1.16 {q8, q9}, [r5, :128]! // 16 ac values of the first row
+        vmul.i16 q8, q8, q1 // diff = ac * alpha
+        vld1.16 {q10, q11}, [r12, :128]! // 16 ac values of the second row
+        vmul.i16 q9, q9, q1
+        vmul.i16 q10, q10, q1
+        vmul.i16 q11, q11, q1
+        vshr.s16 q12, q8, #15 // sign = diff >> 15
+        vshr.s16 q13, q9, #15
+        vshr.s16 q14, q10, #15
+        vshr.s16 q15, q11, #15
+        vadd.i16 q8, q8, q12 // diff + sign
+        vadd.i16 q9, q9, q13
+        vadd.i16 q10, q10, q14
+        vadd.i16 q11, q11, q15
+        vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+        vrshr.s16 q9, q9, #6
+        vrshr.s16 q10, q10, #6
+        vrshr.s16 q11, q11, #6
+        vadd.i16 q8, q8, q0 // dc + apply_sign()
+        vadd.i16 q9, q9, q0
+        vadd.i16 q10, q10, q0
+        vadd.i16 q11, q11, q0
+        vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+        vqmovun.s16 d17, q9
+        vqmovun.s16 d18, q10
+        vqmovun.s16 d19, q11
+        subs r3, r3, #16 // 16 columns done; the NEON stores below leave the flags untouched
+        vst1.16 {q8}, [r0, :128]!
+        vst1.16 {q9}, [r6, :128]!
+        bgt 1b
+        subs r4, r4, #2 // these flags feed the bgt below; the add/mov instructions in between do not set flags
+        add r5, r5, lr, lsl #1 // skip the ac row already consumed through r12
+        add r12, r12, lr, lsl #1 // skip the ac row already consumed through r5
+        add r0, r0, r1
+        add r6, r6, r1
+        mov r3, lr
+        bgt 1b
+        pop {r4-r8, pc}
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+        push {r4-r8, lr} // CfL with dc = rounded mean of the top row; 24 bytes saved: height/ac/alpha at [sp, #24/#28/#32]
+        ldr r4, [sp, #24] // r4 = height
+        ldr r5, [sp, #28] // r5 = ac
+        ldr r6, [sp, #32] // r6 = alpha
+        clz lr, r3
+        adr r12, L(ipred_cfl_top_tbl)
+        sub lr, lr, #26 // width 32 -> index 0 ... width 4 -> index 3
+        ldr lr, [r12, lr, lsl #2]
+        vdup.16 q1, r6 // alpha
+        add r2, r2, #1 // r2 = topleft + 1 (top row)
+        add r12, r12, lr
+        add r6, r0, r1 // r6 = dst + stride (alpha now lives in q1)
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r12
+
+        .align 2
+L(ipred_cfl_top_tbl):
+        .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+        .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+        vld1.32 {d0[]}, [r2]
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #2 // rounded sum / 4
+        vdup.16 q0, d0[0] // q0 = dc, as expected by the shared splat code
+        b L(ipred_cfl_splat_w4)
+8:
+        vld1.8 {d0}, [r2]
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #3 // rounded sum / 8
+        vdup.16 q0, d0[0]
+        b L(ipred_cfl_splat_w8)
+16:
+        vld1.8 {q0}, [r2]
+        vaddl.u8 q0, d0, d1
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #4 // rounded sum / 16
+        vdup.16 q0, d0[0]
+        b L(ipred_cfl_splat_w16)
+32:
+        vld1.8 {q2, q3}, [r2]
+        vaddl.u8 q2, d4, d5
+        vaddl.u8 q3, d6, d7
+        vadd.u16 q0, q2, q3
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #5 // rounded sum / 32
+        vdup.16 q0, d0[0]
+        b L(ipred_cfl_splat_w16) // the w16 splat loop iterates over the full width in r3
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+        push {r4-r8, lr} // CfL with dc = rounded mean of the left column; 24 bytes saved: height/ac/alpha at [sp, #24/#28/#32]
+        ldr r4, [sp, #24] // r4 = height
+        ldr r5, [sp, #28] // r5 = ac
+        ldr r6, [sp, #32] // r6 = alpha
+        sub r2, r2, r4 // r2 = topleft - height
+        clz lr, r3 // clz(width): selects the splat routine
+        clz r8, r4 // clz(height): selects the averaging routine
+        adr r12, L(ipred_cfl_splat_tbl)
+        adr r7, L(ipred_cfl_left_tbl)
+        sub lr, lr, #26
+        sub r8, r8, #26
+        ldr lr, [r12, lr, lsl #2]
+        ldr r8, [r7, r8, lsl #2]
+        vdup.16 q1, r6 // alpha
+        add r12, r12, lr // r12 = splat routine for this width, reached via "bx r12" below
+        add r7, r7, r8 // r7 = averaging routine for this height
+        add r6, r0, r1 // r6 = dst + stride
+        lsl r1, r1, #1 // r1 = 2 * stride
+        bx r7
+
+        .align 2
+L(ipred_cfl_left_tbl):
+        .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+        .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+        vld1.32 {d0[]}, [r2, :32]
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #2 // rounded sum / 4
+        vdup.16 q0, d0[0]
+        bx r12
+
+L(ipred_cfl_left_h8):
+        vld1.8 {d0}, [r2, :64]
+        vpaddl.u8 d0, d0
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #3 // rounded sum / 8
+        vdup.16 q0, d0[0]
+        bx r12
+
+L(ipred_cfl_left_h16):
+        vld1.8 {q0}, [r2, :128]
+        vaddl.u8 q0, d0, d1
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #4 // rounded sum / 16
+        vdup.16 q0, d0[0]
+        bx r12
+
+L(ipred_cfl_left_h32):
+        vld1.8 {q2, q3}, [r2, :128]
+        vaddl.u8 q2, d4, d5
+        vaddl.u8 q3, d6, d7
+        vadd.u16 q0, q2, q3
+        vadd.u16 d0, d0, d1
+        vpadd.u16 d0, d0
+        vpadd.u16 d0, d0
+        vrshr.u16 d0, d0, #5 // rounded sum / 32
+        vdup.16 q0, d0[0]
+        bx r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height,
+//                          const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
+        push {r4-r8, lr}
+        ldr r4, [sp, #24]
+        ldr r5, [sp, #28]
+        ldr r6, [sp, #32]
+        sub r2, r2, r4
+        add r8, r3, r4 // width + height
+        vdup.16 q1, r6 // alpha
+        clz lr, r3
+        clz r6, r4
+        vdup.16 d16, r8 // width + height
+        adr r7, L(ipred_cfl_tbl)
+        rbit r8, r8 // rbit(width + height)
+        sub lr, lr, #22 // 26 leading bits, minus table offset 4
+        sub r6, r6, #26
+        clz r8, r8 // ctz(width + height)
+        ldr lr, [r7, lr, lsl #2]
+        ldr r6, [r7, r6, lsl #2]
+        neg r8, r8 // -ctz(width + height)
+        add r12, r7, lr
+        add r7, r7,
r6 + vshr.u16 d16, d16, #1 // (width + height) >> 1 + vdup.16 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.32 {d0[]}, [r2, :32]! + vpaddl.u8 d0, d0 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w4): + add r2, r2, #1 + vld1.32 {d1[]}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.u16 d1, d1 + cmp r4, #4 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + cmp r4, #16 + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.8 {d0}, [r2, :64]! + vpaddl.u8 d0, d0 + vpadd.i16 d0, d0 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w8): + add r2, r2, #1 + vld1.8 {d1}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.i16 d1, d1 + vpadd.i16 d1, d1 + cmp r4, #8 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.8 {q0}, [r2, :128]! 
+ vaddl.u8 q0, d0, d1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w16): + add r2, r2, #1 + vld1.8 {q2}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #16 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.8 {q2, q3}, [r2, :128]! + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w32): + add r2, r2, #1 + vld1.8 {q2, q3}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q2, q2, q3 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #32 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 + push {r4-r8,lr} + ldr r4, [sp, #24] + ldr r5, [sp, #28] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 
+ + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d2}, [r12, :64], r2 + vld1.8 {d1}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vadd.i16 q8, q8, q0 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q8, q8, q1 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i16 q0, q8, q9 + vadd.i16 q1, q10, q11 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + vadd.i32 q0, q1 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +L(ipred_cfl_ac_420_w4_subtract_dc): +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q2, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d1}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q12, q13}, [r1, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q12, q12 + vpaddl.u8 q13, q13 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vshl.i16 q0, 
q0, #1 + vshl.i16 q1, q1, #1 + vshl.i16 q2, q12, #1 + vshl.i16 q3, q13, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vldr d26, [r1, #16] + vpaddl.u8 q0, q0 + vld1.8 {q12}, [r1, :128], r2 + vpaddl.u8 d6, d6 + vldr d30, [r12, #16] + vpaddl.u8 q2, q2 + vld1.8 {q14}, [r12, :128], r2 + vpaddl.u8 d26, d26 + vpaddl.u8 q12, q12 + vpaddl.u8 d30, d30 + vpaddl.u8 q14, q14 + vadd.i16 d2, d2, d6 + vadd.i16 q0, q0, q2 + vadd.i16 d26, d26, d30 + vadd.i16 q12, q12, q14 + vshl.i16 d2, d2, #1 + vshl.i16 q0, q0, #1 + vshl.i16 d6, d26, #1 + vshl.i16 q2, q12, #1 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q2, q2, #1 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d4}, [r1, :64], r2 + vpaddl.u8 q0, q0 + vld1.8 {d5}, [r12, :64], r2 + vpaddl.u8 q2, q2 + vadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d5 + vshl.i16 d0, d0, #1 + vshl.i16 d4, d4, #1 + vdup.16 q1, d0[3] + vdup.16 q3, d4[3] + vdup.16 d1, d0[3] + vdup.16 d5, d4[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 + push {r4-r8,lr} + ldr r4, [sp, #24] + ldr r5, [sp, #28] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vdup.16 d7, d3[3] + vmov d6, d3 + vdup.16 d5, d2[3] + vmov d4, d2 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vpaddl.u8 q0, q0 + vpaddl.u8 d6, d6 + vpaddl.u8 q2, q2 + vshl.i16 d2, d2, #2 + vshl.i16 q0, q0, #2 + vshl.i16 d6, d6, #2 + vshl.i16 q2, q2, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q2, q2 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + push {r4-r8,lr} + ldr r4, [sp, #24] + ldr r5, [sp, #28] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.32 {d0[]}, [r1, :32], r2 + vld1.32 {d0[1]}, [r12, :32], r2 + vld1.32 {d2[]}, [r1, :32], r2 + vld1.32 {d2[1]}, [r12, :32], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q1, d2, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d4}, [r1, :64], r2 + vshll.u8 q0, d0, #3 + vld1.16 {d6}, [r12, :64], r2 + vshll.u8 q1, d2, #3 + vshll.u8 q2, d4, #3 + vshll.u8 q3, d6, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q3}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d4}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q2, d4, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + vld1.8 {q2, q3}, [r1, :128], r2 + vld1.8 {q13, q14}, [r12, :128], r2 + vshll.u8 q0, d4, #3 + vshll.u8 q1, d5, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vshll.u8 q1, d29, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + vldr d4, [r1, #16] + vld1.8 {q1}, [r1, :128], r2 + vldr d28, [r12, #16] + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q2, d4, #3 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vdup.16 q1, d1[3] + vst1.16 {q12, q13}, [r0, :128]! 
+ vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q0, d27[3] + vdup.16 q1, d27[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d24}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q12, d24, #3 + subs r8, r8, #2 + vdup.16 q1, d1[3] + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q13, d25[3] + vdup.16 q0, d25[3] + vdup.16 q1, d25[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w4_calc_subtract_dc. + vpaddl.u16 q0, q8 + vpaddl.u16 q1, q9 + vpaddl.u16 q2, q10 + vpaddl.u16 q3, q11 + vadd.i32 q0, q0, q1 + vadd.i32 q2, q2, q3 + vadd.i32 q0, q0, q2 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 // rewind r0 to the start of ac: r6 * 8 = height * 64 bytes + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] + b L(ipred_cfl_ac_420_w4_subtract_dc) +endfunc diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S new file mode 100644 index 0000000000..867eb194df --- /dev/null +++ b/third_party/dav1d/src/arm/32/itx.S @@ -0,0 +1,3386 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-d31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g.
if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro vmull_vmlal d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlal.s16 \d1, \s3, \c1 +.endm + +.macro vmull_vmlsl d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s2, \c1 + vmull.s16 \d1, 
\s1, \c0 + vmlsl.s16 \d1, \s3, \c1 +.endm + +.macro vrshrn_8h d0, d1, s0, s1, shift + vrshrn.i32 \d0, \s0, \shift + vrshrn.i32 \d1, \s1, \shift +.endm + +.macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s16 \r0, \r0, \c + vqrdmulh.s16 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s16 \r2, \r2, \c + vqrdmulh.s16 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s16 \r4, \r4, \c + vqrdmulh.s16 \r5, \r5, \c + vqrdmulh.s16 \r6, \r6, \c + vqrdmulh.s16 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + vld1.8 {\load}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.8 {\store}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src, \shiftbits + load_add_store d3, q9, , , , , , \dst, \src, \shiftbits + load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits + load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits + load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits + load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits + load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits + load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits + load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits + load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits + load_add_store , , , , , , d3, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src + load_add_store d3, q9, , , , , , \dst, \src + load_add_store d4, q10, d2, q8, , , , \dst, \src + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src + 
load_add_store , , d4, q10, q9, d3, d2, \dst, \src + load_add_store , , d5, q11, q10, d4, d3, \dst, \src + load_add_store , , , , q11, d5, d4, \dst, \src + load_add_store , , , , , , d5, \dst, \src +.endm +.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + vld1.32 {\load[0]}, [\src, :32], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #4 +.endif +.ifnb \load + vld1.32 {\load[1]}, [\src, :32], r1 +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \store + vst1.32 {\store[0]}, [\dst, :32], r1 +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.32 {\store[1]}, [\dst, :32], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src + load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src + load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src + load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src + load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src + load_add_store4 , , , , q15, d7, d6, \dst, \src + load_add_store4 , , , , , , d7, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src + load_add_store4 , , , , q11, d3, d2, \dst, \src + load_add_store4 , , , , , , d3, \dst, \src +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 
{d16[]}, [r2, :16] + vdup.16 d0, r12 + vqrdmulh.s16 d16, d16, d0[0] + vst1.16 {d30[0]}, [r2, :16] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s16 d16, d16, d0[0] +.endif +.if \shift > 0 + vrshr.s16 d16, d16, #\shift +.endif + vqrdmulh.s16 d20, d16, d0[0] + mov r3, #\h + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + vld1.32 {d0[0]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[0]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + subs r3, r3, #4 + sub r0, r0, r1, lsl #2 + vaddw.u8 q10, q8, d0 + vqmovun.s16 d0, q10 + vaddw.u8 q11, q8, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q11 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon +1: + vld1.8 {d0}, [r0, :64], r1 + vld1.8 {d1}, [r0, :64], r1 + vld1.8 {d2}, [r0, :64], r1 + vaddw.u8 q10, q8, d0 + vld1.8 {d3}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + subs r3, r3, #4 + vaddw.u8 q11, q8, d1 + vqmovun.s16 d0, q10 + vaddw.u8 q12, q8, d2 + vqmovun.s16 d1, q11 + vaddw.u8 q13, q8, d3 + vst1.8 {d0}, [r0, :64], r1 + vqmovun.s16 d2, q12 + vst1.8 {d1}, [r0, :64], r1 + vqmovun.s16 d3, q13 + vst1.8 {d2}, [r0, :64], r1 + vst1.8 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon +1: + vld1.8 {q0}, [r0, :128], r1 + vld1.8 {q1}, [r0, :128], r1 + vld1.8 {q2}, [r0, :128], r1 + subs r3, r3, #4 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vld1.8 {q3}, [r0, :128], r1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #2 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q1}, [r0, :128], r1 + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r0, :128], r1 
+ bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon +1: + vld1.8 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.8 {q2, q3}, [r0, :128], r1 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #1 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #32 +1: + vld1.8 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.8 {q2, q3}, [r0, :128] + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, #32 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128]! 
+ vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i16 d16, d16, d17 + vsub.i16 d21, d18, d19 + vsub.i16 d20, d16, d21 + vshr.s16 d20, d20, #1 + vsub.i16 d18, d20, d17 + vsub.i16 d17, d20, d19 + vadd.i16 d19, d21, d18 + vsub.i16 d16, d16, d17 +.endm + +.macro idct_4h_x4 r0, r1, r2, r3 + vmull_vmlal q3, \r1, \r3, d0[3], d0[2] + vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] + vmull_vmlal q1, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d6, q3, #12 + vrshrn.i32 d7, q2, #12 + vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d2, q1, #12 + vrshrn.i32 d3, q2, #12 + vqadd.s16 \r0, d2, d6 + vqsub.s16 \r3, d2, d6 + vqadd.s16 \r1, d3, d7 + vqsub.s16 \r2, d3, d7 +.endm + +.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] + vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] + vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d12, d13, q6, q7, #12 + vrshrn_8h d14, d15, q4, q5, #12 + vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d4, d5, q2, q3, #12 + vrshrn_8h d6, d7, q4, q5, #12 + vqadd.s16 \q0, q2, q6 + vqsub.s16 \q3, q2, q6 + vqadd.s16 \q1, q3, q7 + vqsub.s16 \q2, q3, q7 +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_4h_x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q1, d16, d18 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmull.s16 q10, d17, d0[3] + vaddw.s16 q1, q1, d19 + vmull.s16 q3, d16, d0[2] + vmlsl.s16 q3, d18, d0[0] + vmlsl.s16 q3, d19, d0[1] + + vadd.s32 q11, q2, q3 + vmul.s32 q1, q1, d1[0] + 
vadd.s32 q2, q2, q10 + vadd.s32 q3, q3, q10 + vsub.s32 q11, q11, q10 + + vrshrn.i32 \o0, q2, #12 + vrshrn.i32 \o2, q1, #12 + vrshrn.i32 \o1, q3, #12 + vrshrn.i32 \o3, q11, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 d19, d18, d17, d16 + bx lr +endfunc + +.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q2, d16, d20 + vsubl.s16 q3, d17, d21 + vmull.s16 q4, d16, d0[0] + vmlal.s16 q4, d20, d0[1] + vmlal.s16 q4, d22, d0[2] + vmull.s16 q5, d17, d0[0] + vmlal.s16 q5, d21, d0[1] + vmlal.s16 q5, d23, d0[2] + vaddw.s16 q2, q2, d22 + vaddw.s16 q3, q3, d23 + vmull.s16 q6, d16, d0[2] + vmlsl.s16 q6, d20, d0[0] + vmlsl.s16 q6, d22, d0[1] + vmull.s16 q7, d17, d0[2] + vmlsl.s16 q7, d21, d0[0] + vmlsl.s16 q7, d23, d0[1] + + vmul.s32 q10, q2, d1[0] + vmul.s32 q11, q3, d1[0] + + vmull.s16 q2, d18, d0[3] + vmull.s16 q3, d19, d0[3] + + vadd.s32 q8, q4, q2 // out0 + vadd.s32 q9, q5, q3 + + vadd.s32 q4, q4, q6 // out3 + vadd.s32 q5, q5, q7 + + vadd.s32 q6, q6, q2 // out1 + vadd.s32 q7, q7, q3 + + vsub.s32 q4, q4, q2 // out3 + vsub.s32 q5, q5, q3 + + vrshrn.i32 d20, q10, #12 + vrshrn.i32 d21, q11, #12 + + vrshrn.i32 \o0, q8, #12 + vrshrn.i32 \o1, q9, #12 + +.ifc \o4, d18 + vmov q9, q10 +.endif + + vrshrn.i32 \o2, q6, #12 + vrshrn.i32 \o3, q7, #12 + + vrshrn.i32 \o6, q4, #12 + vrshrn.i32 \o7, q5, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 + iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_identity_4h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q2, q8, d0[0] + vqrdmulh.s16 q3, q9, d0[0] + vqadd.s16 q8, q8, q2 + vqadd.s16 q9, q9, q3 + bx lr +endfunc + +function inv_identity_8h_x4_neon, export=1 + movw r12, 
#(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q1, q8, d0[0] + vqrdmulh.s16 q2, q9, d0[0] + vqrdmulh.s16 q3, q10, d0[0] + vqadd.s16 q8, q8, q1 + vqrdmulh.s16 q1, q11, d0[0] + vqadd.s16 q9, q9, q2 + vqadd.s16 q10, q10, q3 + vqadd.s16 q11, q11, q1 + bx lr +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0, \r1, \r2, \r3 + vqrdmulh.s16 q1, \i, \c + vrhadd.s16 \i, \i, q1 +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + vshr.s16 q8, q8, #2 + vshr.s16 q9, q9, #2 + + iwht4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + iwht4 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + blx r4 + + vst1.16 {q15}, [r2, :128]! 
+ transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + +L(itx_4x4_end): + sub r0, r0, r1, lsl #2 + vaddw.u8 q8, q8, d0 + vqmovun.s16 d0, q8 + vaddw.u8 q9, q9, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q9 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d4, r12 + vst1.16 {d30[0]}, [r2, :16] + vqrdmulh.s16 d16, d16, d4[0] + vld1.32 {d0[0]}, [r0, :32], r1 + vqrdmulh.s16 d20, d16, d4[0] + vld1.32 {d0[1]}, [r0, :32], r1 + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + vld1.32 {d1[0]}, [r0, :32], r1 + vmov q9, q8 + vld1.32 {d1[1]}, [r0, :32], r1 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4h_x4_neon + movrel_local r5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 + + vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a + vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a 
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a + vrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a + vrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vrshrn_8h \r10, \r11, q2, q3, #12 // t6a + + vqadd.s16 q2, \q1, \q3 // t4 + vqsub.s16 \q1, \q1, \q3 // t5a + vqadd.s16 q3, \q7, \q5 // t7 + vqsub.s16 \q3, \q7, \q5 // t6a + + vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 + vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 + vrshrn_8h d8, d9, q4, q5, #12 // t5 + vrshrn_8h d10, d11, q6, q7, #12 // t6 + + vqsub.s16 \q7, \q0, q3 // out7 + vqadd.s16 \q0, \q0, q3 // out0 + vqadd.s16 \q1, \q2, q5 // out1 + vqsub.s16 q6, \q2, q5 // out6 + vqadd.s16 \q2, \q4, q4 // out2 + vqsub.s16 \q5, \q4, q4 // out5 + vqadd.s16 \q3, \q6, q2 // out3 + vqsub.s16 \q4, \q6, q2 // out4 + vmov \q6, q6 // out6 +.endm + +.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4h_x4 \r0, \r2, \r4, \r6 + + vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a + vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a + vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a + vrshrn.i32 \r1, q1, #12 // t4a + vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a + vrshrn.i32 \r7, q2, #12 // t7a + vrshrn.i32 \r3, q3, #12 // t5a + vrshrn.i32 \r5, q1, #12 // t6a + + vqadd.s16 d2, \r1, \r3 // t4 + vqsub.s16 \r1, \r1, \r3 // t5a + vqadd.s16 d3, \r7, \r5 // t7 + vqsub.s16 \r3, \r7, \r5 // t6a + + vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 + vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 + vrshrn.i32 d4, q2, #12 // t5 + vrshrn.i32 d5, q3, #12 // t6 + + vqsub.s16 \r7, \r0, d3 // out7 + vqadd.s16 \r0, \r0, d3 // out0 + vqadd.s16 \r1, \r2, d5 // out1 + vqsub.s16 d6, \r2, d5 // out6 + vqadd.s16 \r2, \r4, d4 // out2 + vqsub.s16 \r5, \r4, d4 // out5 + vqadd.s16 \r3, \r6, d2 // out3 + vqsub.s16 \r4, \r6, d2 // out4 + vmov \r6, d6 // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 
+ movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] + vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] + vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] + vrshrn_8h d16, d17, q2, q3, #12 // t0a + vrshrn_8h d30, d31, q4, q5, #12 // t1a + vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] + vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] + vrshrn_8h d20, d21, q6, q7, #12 // t2a + vrshrn_8h d26, d27, q2, q3, #12 // t3a + vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] + vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] + vrshrn_8h d24, d25, q4, q5, #12 // t4a + vrshrn_8h d22, d23, q6, q7, #12 // t5a + vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] + vrshrn_8h d28, d29, q2, q3, #12 // t6a + vrshrn_8h d18, d19, q4, q5, #12 // t7a + + vqadd.s16 q2, q8, q12 // t0 + vqsub.s16 q3, q8, q12 // t4 + vqadd.s16 q4, q15, q11 // t1 + vqsub.s16 q5, q15, q11 // t5 + vqadd.s16 q6, q10, q14 // t2 + vqsub.s16 q7, q10, q14 // t6 + vqadd.s16 q10, q13, q9 // t3 + vqsub.s16 q11, q13, q9 // t7 + + vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] + vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] + + vrshrn_8h d6, d7, q8, q9, #12 // t4a + vrshrn_8h d10, d11, q12, q13, #12 // t5a + + vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] + + vrshrn_8h d14, d15, q14, q15, #12 // t6a + vrshrn_8h d22, d23, 
q8, q9, #12 // t7a + + vqadd.s16 \q0, q2, q6 // out0 + vqsub.s16 q2, q2, q6 // t2 + vqadd.s16 \q7, q4, q10 // out7 + vqsub.s16 q4, q4, q10 // t3 + vqneg.s16 \q7, \q7 // out7 + + vqadd.s16 \q1, q3, q7 // out1 + vqsub.s16 q3, q3, q7 // t6 + vqadd.s16 \q6, q5, q11 // out6 + vqsub.s16 q5, q5, q11 // t7 + vqneg.s16 \q1, \q1 // out1 + + vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) + vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) + vrshrn_8h d4, d5, q10, q11, #12 // out3 + vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) + vrshrn_8h d6, d7, q12, q13, #12 // out5 + vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + + vqneg.s16 \q3, q2 // out3 + vqneg.s16 \q5, q3 // out5 +.endm + +.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal q2, d23, d16, d0[0], d0[1] + vmull_vmlsl q3, d23, d16, d0[1], d0[0] + vmull_vmlal q4, d21, d18, d0[2], d0[3] + vrshrn.i32 d16, q2, #12 // t0a + vrshrn.i32 d23, q3, #12 // t1a + vmull_vmlsl q5, d21, d18, d0[3], d0[2] + vmull_vmlal q6, d19, d20, d1[0], d1[1] + vrshrn.i32 d18, q4, #12 // t2a + vrshrn.i32 d21, q5, #12 // t3a + vmull_vmlsl q7, d19, d20, d1[1], d1[0] + vmull_vmlal q2, d17, d22, d1[2], d1[3] + vrshrn.i32 d20, q6, #12 // t4a + vrshrn.i32 d19, q7, #12 // t5a + vmull_vmlsl q3, d17, d22, d1[3], d1[2] + vrshrn.i32 d22, q2, #12 // t6a + vrshrn.i32 d17, q3, #12 // t7a + + vqadd.s16 d4, d16, d20 // t0 + vqsub.s16 d5, d16, d20 // t4 + vqadd.s16 d6, d23, d19 // t1 + vqsub.s16 d7, d23, d19 // t5 + vqadd.s16 d8, d18, d22 // t2 + vqsub.s16 d9, d18, d22 // t6 + vqadd.s16 d18, d21, d17 // t3 + vqsub.s16 d19, d21, d17 // t7 + + vmull_vmlal q8, d5, d7, d2[3], d2[2] + vmull_vmlsl q10, d5, d7, d2[2], d2[3] + vmull_vmlsl q11, d19, d9, d2[3], 
d2[2] + + vrshrn.i32 d5, q8, #12 // t4a + vrshrn.i32 d7, q10, #12 // t5a + + vmull_vmlal q8, d19, d9, d2[2], d2[3] + + vrshrn.i32 d9, q11, #12 // t6a + vrshrn.i32 d19, q8, #12 // t7a + + vqadd.s16 \r0, d4, d8 // out0 + vqsub.s16 d4, d4, d8 // t2 + vqadd.s16 \r7, d6, d18 // out7 + vqsub.s16 d6, d6, d18 // t3 + vqneg.s16 \r7, \r7 // out7 + + vqadd.s16 \r1, d5, d9 // out1 + vqsub.s16 d5, d5, d9 // t6 + vqadd.s16 \r6, d7, d19 // out6 + vqsub.s16 d7, d7, d19 // t7 + vqneg.s16 \r1, \r1 // out1 + + vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) + vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) + vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) + vrshrn.i32 d4, q9, #12 // out3 + vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) + vrshrn.i32 d5, q10, #12 // out5 + vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21) + vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19) + + vqneg.s16 \r3, d4 // out3 + vqneg.s16 \r5, d5 // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_8h_x8_neon, export=1 + iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_8h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + vqshl.s16 q12, q12, #1 + vqshl.s16 q13, q13, #1 + vqshl.s16 q14, q14, #1 + vqshl.s16 q15, q15, #1 + bx lr +endfunc + +function inv_identity_4h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + bx lr 
+endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128] + +.ifc \variant, identity_ + // The identity vqshl #1 and downshift vrshr #1 cancel out +.else + blx r4 + + vrshr.s16 q8, q8, #1 + vrshr.s16 q9, q9, #1 + vrshr.s16 q10, q10, #1 + vrshr.s16 q11, q11, #1 + vrshr.s16 q12, q12, #1 + vrshr.s16 q13, q13, #1 + vrshr.s16 q14, q14, #1 + vrshr.s16 q15, q15, #1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + movrel_local r4, inv_\txfm1\()_8h_x8_neon + b inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! 
+ vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +function inv_txfm_add_4x8_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x8h q8, q9, q10, q11 + vswp d17, d20 + vswp d19, d21 + vswp d17, d18 + vswp d19, d22 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0, q1}, [r12, :128] + + vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a + vmull_vmlal q3, d17, 
d31, d2[1], d2[0] // -> t15a + vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a + vrshrn.i32 d17, q2, #12 // t8a + vrshrn.i32 d31, q3, #12 // t15a + vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a + vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a + vrshrn.i32 d23, q4, #12 // t9a + vrshrn.i32 d25, q2, #12 // t14a + vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a + vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a + vrshrn.i32 d21, q3, #12 // t10a + vrshrn.i32 d27, q4, #12 // t13a + vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a + vrshrn.i32 d19, q2, #12 // t11a + vrshrn.i32 d29, q3, #12 // t12a + + idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vqsub.s16 d4, d17, d23 // t9 + vqadd.s16 d17, d17, d23 // t8 + vqsub.s16 d5, d31, d25 // t14 + vqadd.s16 d31, d31, d25 // t15 + vqsub.s16 d23, d19, d21 // t10 + vqadd.s16 d19, d19, d21 // t11 + vqadd.s16 d25, d29, d27 // t12 + vqsub.s16 d29, d29, d27 // t13 + + vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a + vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a + vrshrn.i32 d21, q3, #12 // t9a + vrshrn.i32 d27, q4, #12 // t14a + + vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a + vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a + vrshrn.i32 d29, q3, #12 // t13a + vneg.s32 q4, q4 + vrshrn.i32 d23, q4, #12 // t10a + + vqsub.s16 d4, d17, d19 // t11a + vqadd.s16 d17, d17, d19 // t8a + vqsub.s16 d5, d31, d25 // t12a + vqadd.s16 d31, d31, d25 // t15a + vqadd.s16 d19, d21, d23 // t9 + vqsub.s16 d21, d21, d23 // t10 + vqsub.s16 d25, d27, d29 // t13 + vqadd.s16 d27, d27, d29 // t14 + + vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 + vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 + vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a + + vrshrn.i32 d6, q3, #12 // t11 + vrshrn.i32 d7, q4, #12 // t12 + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a + vrshrn.i32 d4, q2, #12 // t10a + vrshrn.i32 d5, q4, #12 // t13a + + vqadd.s16 d8, d16, d31 // out0 + vqsub.s16 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s16 d23, d30, d17 
// out7 + vqsub.s16 d9, d30, d17 // out8 + vqadd.s16 d17, d18, d27 // out1 + vqsub.s16 d30, d18, d27 // out14 + vqadd.s16 d18, d20, d5 // out2 + vqsub.s16 d29, d20, d5 // out13 + vqadd.s16 d5, d28, d19 // out6 + vqsub.s16 d25, d28, d19 // out9 + vqadd.s16 d19, d22, d7 // out3 + vqsub.s16 d28, d22, d7 // out12 + vqadd.s16 d20, d24, d6 // out4 + vqsub.s16 d27, d24, d6 // out11 + vqadd.s16 d21, d26, d4 // out5 + vqsub.s16 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.16 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + + vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 + vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 + vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 + vrshrn.i32 d16, q2, #12 // t0 + vrshrn.i32 d31, q3, #12 // t1 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 + vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 + vrshrn.i32 d18, q4, #12 // t2 + vrshrn.i32 d29, q2, #12 // t3 + vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 + vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 + vrshrn.i32 d20, q3, #12 // t4 + vrshrn.i32 d27, q4, #12 // t5 + vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 + vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 + vrshrn.i32 d22, q2, #12 // t6 + vrshrn.i32 d25, q3, #12 // t7 + vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 + vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 + vrshrn.i32 d23, q4, #12 // t8 + vrshrn.i32 d24, q2, #12 // t9 + vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 + vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 + vrshrn.i32 d21, q3, #12 // t10 + vrshrn.i32 d26, q4, #12 // t11 + vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 + vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 + vrshrn.i32 d19, q2, #12 // t12 + vrshrn.i32 d28, q3, #12 // t13 + vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 + vrshrn.i32 d17, q4, #12 // t14 + vrshrn.i32 d30, q2, #12 
// t15 + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d23 // t8a + vqadd.s16 d16, d16, d23 // t0a + vqsub.s16 d3, d31, d24 // t9a + vqadd.s16 d31, d31, d24 // t1a + vqadd.s16 d23, d18, d21 // t2a + vqsub.s16 d18, d18, d21 // t10a + vqadd.s16 d24, d29, d26 // t3a + vqsub.s16 d29, d29, d26 // t11a + vqadd.s16 d21, d20, d19 // t4a + vqsub.s16 d20, d20, d19 // t12a + vqadd.s16 d26, d27, d28 // t5a + vqsub.s16 d27, d27, d28 // t13a + vqadd.s16 d19, d22, d17 // t6a + vqsub.s16 d22, d22, d17 // t14a + vqadd.s16 d28, d25, d30 // t7a + vqsub.s16 d25, d25, d30 // t15a + + vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 + vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 + vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 + vrshrn.i32 d17, q2, #12 // t8 + vrshrn.i32 d30, q3, #12 // t9 + vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 + vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 + vrshrn.i32 d18, q4, #12 // t10 + vrshrn.i32 d29, q2, #12 // t11 + vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 + vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 + vrshrn.i32 d27, q3, #12 // t12 + vrshrn.i32 d20, q4, #12 // t13 + vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 + vrshrn.i32 d25, q2, #12 // t14 + vrshrn.i32 d22, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t4 + vqadd.s16 d16, d16, d21 // t0 + vqsub.s16 d3, d31, d26 // t5 + vqadd.s16 d31, d31, d26 // t1 + vqadd.s16 d21, d23, d19 // t2 + vqsub.s16 d23, d23, d19 // t6 + vqadd.s16 d26, d24, d28 // t3 + vqsub.s16 d24, d24, d28 // t7 + vqadd.s16 d19, d17, d27 // t8a + vqsub.s16 d17, d17, d27 // t12a + vqadd.s16 d28, d30, d20 // t9a + vqsub.s16 d30, d30, d20 // t13a + vqadd.s16 d27, d18, d25 // t10a + vqsub.s16 d18, d18, d25 // t14a + vqadd.s16 d20, d29, d22 // t11a + vqsub.s16 d29, d29, d22 // t15a + + vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a + vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a + vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a + vrshrn.i32 d22, q2, #12 // t4a + vrshrn.i32 d25, q3, #12 // t5a + vmull_vmlal q2, 
d24, d23, d0[2], d0[3] // -> t7a + vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 + vrshrn.i32 d24, q4, #12 // t6a + vrshrn.i32 d23, q2, #12 // t7a + vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 + vrshrn.i32 d17, q3, #12 // t12 + vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 + vrshrn.i32 d29, q4, #12 // t13 + vrshrn.i32 d30, q2, #12 // t14 + vrshrn.i32 d18, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s16 \o0, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 +.else + vqadd.s16 d4, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s16 \o15, \o15 // out15 + + vqsub.s16 d3, d29, d18 // t15a + vqadd.s16 \o13,d29, d18 // out13 + vqadd.s16 \o2, d17, d30 // out2 + vqsub.s16 d26, d17, d30 // t14a + vqneg.s16 \o13,\o13 // out13 + + vqadd.s16 \o1, d19, d27 // out1 + vqsub.s16 d27, d19, d27 // t10 + vqadd.s16 \o14,d28, d20 // out14 + vqsub.s16 d20, d28, d20 // t11 + vqneg.s16 \o1, \o1 // out1 + + vqadd.s16 \o3, d22, d24 // out3 + vqsub.s16 d22, d22, d24 // t6 + vqadd.s16 \o12,d25, d23 // out12 + vqsub.s16 d23, d25, d23 // t7 + vqneg.s16 \o3, \o3 // out3 + + vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshrn.i32 d24, q12, #12 // out8 + vrshrn.i32 d4, q2, #12 // out7 + vrshrn.i32 d5, q3, #12 // out5 + vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshrn.i32 d26, q4, #12 // out10 + + vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshrn.i32 \o4, q1, #12 // out4 + vrshrn.i32 d7, q3, #12 // out9 + vrshrn.i32 d6, q4, 
#12 // out11 + vrshrn.i32 \o6, q11, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s16 \o7, d4 // out7 + vqneg.s16 \o5, d5 // out5 + vqneg.s16 \o11,d6 // out11 + vqneg.s16 \o9, d7 // out9 +.endm + +function inv_adst_4h_x16_neon, export=1 + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_4h_x16_neon, export=1 + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q1, \i, d0[0] + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_4x16_shift2 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vshr.s16 q2, q2, #1 + vrhadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_4x16_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vrshr.s16 q2, q2, #1 + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_8x8_shift1 c + identity_4x16_shift1 \c +.endm + +.macro identity_8x8 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + push {lr} + vmov.i16 d7, #0 +.if \identity + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.endif +.if \scale + movw r12, #2896*8 + vdup.16 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif +.if \identity +.if \shift == -2 + identity_4x16_shift2 d0[0] +.else + identity_4x16_shift1 d0[0] +.endif +.else + blx r4 +.endif +.if \shift > 0 +.irp i, q8, q9, q10, q11, q12, q13, q14, 
q15 + vrshr.s16 \i, \i, #\shift +.endr +.endif + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31 + vst1.16 {\i}, [r6, :64]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + blx r9 +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4} +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_identity_16x4_neon +.else + movrel_local r9, inv_txfm_horz_16x4_neon + movrel_local r4, inv_\txfm1\()_4h_x16_neon +.endif + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b 
inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + +.ifc \variant, identity_ + vmov.i16 d4, #0 +.irp i, d16, d18, d20, d22 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d17, d19, d21, d23 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, d24, d26, d28, d30 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d25, d27, d29, d31 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + + identity_4x16_shift1 d0[0] +.else + vmov.i16 q2, #0 + vmov.i16 q3, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d24, d25, d26, d27}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d28, d29, d30, d31}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! 
+ + blx r4 + + vswp d17, d20 + vswp d19, d22 + vswp d18, d20 + vswp d19, d21 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + +.ifc \variant, identity_ + vmov q8, q12 + vmov q9, q13 + vmov q10, q14 + vmov q11, q15 +.else + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 + vrshr.s16 q8, q12, #1 + vrshr.s16 q9, q13, #1 + vrshr.s16 q10, q14, #1 + vrshr.s16 q11, q15, #1 +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + add r6, r0, #8 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_4x16_neon + vmov.i16 q2, #0 + + mov r11, #32 + cmp r3, r10 + blt 1f + + add r6, r2, #16 +.ifc \variant, identity_ +.irp i, q12, q13, q14, q15 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q12, q13, q14, q15, d0[0] +.else +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + blx r4 + vrshr.s16 q12, q8, #1 + vrshr.s16 q13, q9, #1 + vrshr.s16 q14, q10, #1 + vrshr.s16 q15, q11, #1 +.endif + transpose_4x8h q12, q13, q14, q15 + vswp d27, d29 + vswp d26, d28 + vswp d27, d30 + vswp d25, d28 + + b 2f +1: +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +2: + vmov.i16 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr +.ifc \variant, identity_ + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q8, q9, q10, q11, d0[0] +.else + blx r4 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + vswp d19, d21 + vswp d18, d20 + vswp d19, d22 + vswp d17, d20 + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function 
inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon + mov r10, #\eob_half +.else + movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + sub_sp_align 256 + +.irp i, 0, 4 + add r6, sp, #(\i*16*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #8*2 + blx r9 +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_8x16_neon + sub_sp_align 256 + +.irp i, 0, 8 + add r6, sp, #(\i*8*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + + vmov.i16 q2, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128] + vst1.16 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.ifc \variant, identity_ + // The identity shl #1 and downshift vrshr #1 cancel out +.else + blx r4 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \j, \j, #1 +.endr +.endif + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + vst1.16 {q8, q9}, [r6, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_8h_x8_neon + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.else +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon +.else + movrel_local r4, inv_\txfm1\()_4h_x16_neon + movrel_local r9, inv_txfm_horz_scale_16x4_neon +.endif + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.endif +.if \w == 8 + mov r10, #\eob_8x8 +.else + mov r10, #\eob_4x4 +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43, 10 +def_fn_816 \w, \h, identity, identity, 43, 10 +def_fn_816 \w, \h, dct, adst, 43, 10 +def_fn_816 \w, \h, dct, flipadst, 43, 10 +def_fn_816 \w, \h, dct, identity, 8, 4 +def_fn_816 \w, \h, adst, dct, 43, 10 +def_fn_816 \w, \h, adst, adst, 43, 10 +def_fn_816 \w, \h, adst, flipadst, 43, 10 +def_fn_816 \w, \h, flipadst, dct, 43, 10 +def_fn_816 \w, \h, flipadst, adst, 43, 10 +def_fn_816 \w, \h, flipadst, flipadst, 43, 10 +def_fn_816 \w, \h, identity, dct, 64, 4 +def_fn_816 \w, \h, adst, identity, 8, 4 +def_fn_816 \w, \h, flipadst, identity, 8, 4 +def_fn_816 \w, \h, identity, adst, 64, 4 +def_fn_816 \w, \h, identity, flipadst, 64, 4 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs, 2*16 + vld1.16 {q0, q1}, [r12, :128] + sub r12, r12, #2*16 + + vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a + vmull_vmlal q3, d16, d31, d0[1], d0[0] 
// -> t31a + vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a + vrshrn.i32 d16, q2, #12 // t16a + vrshrn.i32 d31, q3, #12 // t31a + vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a + vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a + vrshrn.i32 d24, q4, #12 // t17a + vrshrn.i32 d23, q2, #12 // t30a + vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a + vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a + vrshrn.i32 d20, q3, #12 // t18a + vrshrn.i32 d27, q4, #12 // t29a + vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a + vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a + vrshrn.i32 d28, q2, #12 // t19a + vrshrn.i32 d19, q3, #12 // t28a + vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a + vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a + vrshrn.i32 d18, q4, #12 // t20a + vrshrn.i32 d29, q2, #12 // t27a + vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a + vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a + vrshrn.i32 d26, q3, #12 // t21a + vrshrn.i32 d21, q4, #12 // t26a + vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a + vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a + vrshrn.i32 d22, q2, #12 // t22a + vrshrn.i32 d25, q3, #12 // t25a + vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a + vrshrn.i32 d30, q4, #12 // t23a + vrshrn.i32 d17, q2, #12 // t24a + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d24 // t17 + vqadd.s16 d16, d16, d24 // t16 + vqsub.s16 d3, d31, d23 // t30 + vqadd.s16 d31, d31, d23 // t31 + vqsub.s16 d24, d28, d20 // t18 + vqadd.s16 d28, d28, d20 // t19 + vqadd.s16 d23, d18, d26 // t20 + vqsub.s16 d18, d18, d26 // t21 + vqsub.s16 d20, d30, d22 // t22 + vqadd.s16 d30, d30, d22 // t23 + vqadd.s16 d26, d17, d25 // t24 + vqsub.s16 d17, d17, d25 // t25 + vqsub.s16 d22, d29, d21 // t26 + vqadd.s16 d29, d29, d21 // t27 + vqadd.s16 d25, d19, d27 // t28 + vqsub.s16 d19, d19, d27 // t29 + + vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a + vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a + vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a + 
vrshrn.i32 d21, q2, #12 // t17a + vrshrn.i32 d27, q3, #12 // t30a + vneg.s32 q4, q4 // -> t18a + vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a + vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a + vrshrn.i32 d19, q4, #12 // t18a + vrshrn.i32 d24, q1, #12 // t29a + vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a + vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a + vrshrn.i32 d22, q2, #12 // t21a + vrshrn.i32 d18, q3, #12 // t26a + vneg.s32 q4, q4 // -> t22a + vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a + vrshrn.i32 d17, q4, #12 // t22a + vrshrn.i32 d20, q1, #12 // t25a + + vqsub.s16 d2, d27, d24 // t29 + vqadd.s16 d27, d27, d24 // t30 + vqsub.s16 d3, d21, d19 // t18 + vqadd.s16 d21, d21, d19 // t17 + vqsub.s16 d24, d16, d28 // t19a + vqadd.s16 d16, d16, d28 // t16a + vqsub.s16 d19, d30, d23 // t20a + vqadd.s16 d30, d30, d23 // t23a + vqsub.s16 d28, d17, d22 // t21 + vqadd.s16 d17, d17, d22 // t22 + vqadd.s16 d23, d26, d29 // t24a + vqsub.s16 d26, d26, d29 // t27a + vqadd.s16 d22, d20, d18 // t25 + vqsub.s16 d20, d20, d18 // t26 + vqsub.s16 d29, d31, d25 // t28a + vqadd.s16 d31, d31, d25 // t31a + + vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a + vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a + vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 + vrshrn.i32 d18, q2, #12 // t18a + vrshrn.i32 d25, q3, #12 // t29a + vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 + vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 + vrshrn.i32 d29, q4, #12 // t19 + vrshrn.i32 d24, q1, #12 // t28 + vneg.s32 q2, q2 // -> t20 + vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 + vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a + vrshrn.i32 d26, q2, #12 // t20 + vrshrn.i32 d19, q3, #12 // t27 + vneg.s32 q4, q4 // -> t21a + vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a + vrshrn.i32 d20, q4, #12 // t21a + vrshrn.i32 d28, q1, #12 // t26a + + vqsub.s16 d2, d16, d30 // t23 + vqadd.s16 d16, d16, d30 // t16 = out16 + vqsub.s16 d3, d31, d23 // t24 + vqadd.s16 d31, d31, d23 // t31 
= out31 + vqsub.s16 d23, d21, d17 // t22a + vqadd.s16 d17, d21, d17 // t17a = out17 + vqadd.s16 d30, d27, d22 // t30a = out30 + vqsub.s16 d21, d27, d22 // t25a + vqsub.s16 d27, d18, d20 // t21 + vqadd.s16 d18, d18, d20 // t18 = out18 + vqadd.s16 d4, d29, d26 // t19a = out19 + vqsub.s16 d26, d29, d26 // t20a + vqadd.s16 d29, d25, d28 // t29 = out29 + vqsub.s16 d25, d25, d28 // t26 + vqadd.s16 d28, d24, d19 // t28a = out28 + vqsub.s16 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 + vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 + vrshrn.i32 d20, q2, #12 // t20 + vrshrn.i32 d22, q3, #12 // t27 + + vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a + vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshrn.i32 d26, q2, #12 // t26a + + vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 + vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 + vrshrn.i32 d21, q3, #12 // t21a + vrshrn.i32 d22, q12, #12 // t22 + vrshrn.i32 d25, q2, #12 // t25 + + vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a + vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a + vrshrn.i32 d23, q2, #12 // t23a + vrshrn.i32 d24, q3, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + push {lr} + vmov.i16 d7, #0 + lsl r8, r8, #1 +.if \scale + movw r12, #2896*8 + vdup.16 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_4h_x16_neon + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! 
+ vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! + add r6, r6, #32 +.endm + store1 d16, d20, d24, d28 + store1 d17, d21, d25, d29 + store1 d18, d22, d26, d30 + store1 d19, d23, d27, d31 +.purgem store1 + sub r6, r6, #64*4 + + vmov.i16 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_4h_x16_neon + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q13, q12, d27, d26, d25, d24 + transpose_4x4h q11, q10, d23, d22, d21, d20 + transpose_4x4h q9, q8, d19, d18, d17, d16 +.macro store2 r0, r1, r2, r3, shift + vld1.16 {q0, q1}, [r6, :128] + vqsub.s16 d7, d0, \r0 + vqadd.s16 d0, d0, \r0 + vqsub.s16 d6, d1, \r1 + vqadd.s16 d1, d1, \r1 + vqsub.s16 d5, d2, \r2 + vqadd.s16 d2, d2, \r2 + vqsub.s16 d4, d3, \r3 + vqadd.s16 d3, d3, \r3 + vrev64.16 q2, q2 + vrev64.16 q3, q3 + vrshr.s16 q0, q0, #\shift + vrshr.s16 q1, q1, #\shift + vrshr.s16 q2, q2, #\shift + vrshr.s16 q3, q3, #\shift + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! 
+.endm + + store2 d31, d27, d23, d19, \shift + store2 d30, d26, d22, d18, \shift + store2 d29, d25, d21, d17, \shift + store2 d28, d24, d20, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl inv_dct_4h_x16_neon + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl inv_dct32_odd_4h_x16_neon + + neg r9, r8 + mov r10, r6 +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.32 {d2[0]}, [r10, :32], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.32 {d2[1]}, [r10, :32], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.32 {d3[0]}, [r10, :32], r1 + \op\().s16 d5, d5, \r1 + vld1.32 {d3[1]}, [r10, :32], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vaddw.u8 q2, q2, d2 + \op\().s16 d7, d7, \r3 + vqmovun.s16 d2, q2 + vrshr.s16 q3, q3, #4 + vst1.32 {d2[0]}, [r6, :32], r1 + vaddw.u8 q3, q3, d3 + vst1.32 {d2[1]}, [r6, :32], r1 + vqmovun.s16 d3, q3 + vst1.32 {d3[0]}, [r6, :32], r1 + vst1.32 {d3[1]}, [r6, :32], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop 
{r10-r11,pc} +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + // Contrary to the others, this one is only ever used in increments of 8x8 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + push {r4-r7,lr} + vmov.i16 q0, #0 + movrel_local r5, eob_32x32, 2 + + mov r6, #2*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 2 +2: + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r6 +.endr + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + load_add_store_8x8 r0, r7, shiftbits=2 + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r7,lr} + movw r6, #2896*8 + movw r7, #2*(5793-4096)*8 + vdup.i16 d0, r6 + movrel_local r5, eob_16x32\hshort, 2 + vmov.16 d0[1], r7 + + mov r6, #2*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 2 +2: + vmov.i16 q1, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s16, 1 + identity_8x8 d0[1] +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + +.if \w == 16 
+ load_add_store_8x8 r0, r7, shiftbits=2 +.else + load_add_store_8x8 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q0, #0 + movrel_local r4, eob_8x32 + + mov r12, #2*\h +1: + ldrh lr, [r4], #2 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r12 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs vrshr.s16, 1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + cmp r3, lr +.if \w == 8 + load_add_store_8x8 r0, r5, shiftbits=2 +.else + load_add_store_8x8 r0, r5, shiftbits=3 +.endif + + blt 9f +.if \w == 8 + sub r2, r2, r12, lsl #3 + add r2, r2, #2*8 +.else + sub r0, r0, r1, lsl #3 + add r0, r0, #8 +.endif + b 1b + +9: + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r5, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32 + + mov r8, #2*32 + mov r9, #32 + mov r6, sp +1: + vmov.i16 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #2 + sub r2, r2, r8, lsl #3 + sub r9, r9, #8 + add r2, r2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #2 +.endr + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #8 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + +.irp i, 0, 4 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + cmp r3, #10 + blt 1f +.endif + mov r8, #8*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + +2: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl inv_dct_8h_x8_neon + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.16 {d0, d1, d2}, [r12, :64]! + + vqrdmulh.s16 d23, d16, d0[1] // t63a + vqrdmulh.s16 d16, d16, d0[0] // t32a + vqrdmulh.s16 d22, d17, d0[2] // t62a + vqrdmulh.s16 d17, d17, d0[3] // t33a + vqrdmulh.s16 d21, d18, d1[1] // t61a + vqrdmulh.s16 d18, d18, d1[0] // t34a + vqrdmulh.s16 d20, d19, d1[2] // t60a + vqrdmulh.s16 d19, d19, d1[3] // t35a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t33 + vqsub.s16 d26, d19, d18 // t34 + vqadd.s16 d27, d19, d18 // t35 + vqadd.s16 d28, d20, d21 // t60 + vqsub.s16 d29, d20, d21 // t61 + vqsub.s16 d30, d23, d22 // t62 + vqadd.s16 d31, d23, d22 // t63 + + vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a + vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a + vneg.s32 q2, q2 // t34a + vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a + vrshrn.i32 d26, q2, #12 // t34a + vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a + vrshrn.i32 d29, q3, #12 // t61a + vrshrn.i32 d25, q4, #12 // t33a + vrshrn.i32 d30, q2, #12 // t62a + + vqadd.s16 d16, d24, d27 // t32a + vqsub.s16 d19, d24, d27 // t35a + vqadd.s16 d17, d25, d26 // t33 + vqsub.s16 d18, d25, d26 // t34 + vqsub.s16 d20, d31, d28 // t60a + vqadd.s16 d23, d31, d28 // t63a + vqsub.s16 d21, d30, d29 // t61 + vqadd.s16 d22, d30, d29 // t62 + + vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a + vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a + vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 + 
vrshrn.i32 d21, q2, #12 // t61a + vrshrn.i32 d18, q3, #12 // t34a + vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 + vrshrn.i32 d20, q4, #12 // t60 + vrshrn.i32 d19, q2, #12 // t35 + + vst1.16 {d16, d17, d18, d19}, [r6, :128]! + vst1.16 {d20, d21, d22, d23}, [r6, :128]! + + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #2*4*0] // t32a + vldr d17, [r9, #2*4*8] // t39a + vldr d18, [r9, #2*4*0] // t63a + vldr d19, [r6, #2*4*8] // t56a + vldr d20, [r6, #2*4*16] // t40a + vldr d21, [r9, #2*4*24] // t47a + vldr d22, [r9, #2*4*16] // t55a + vldr d23, [r6, #2*4*24] // t48a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t39 + vqadd.s16 d26, d18, d19 // t63 + vqsub.s16 d27, d18, d19 // t56 + vqsub.s16 d28, d21, d20 // t40 + vqadd.s16 d29, d21, d20 // t47 + vqadd.s16 d30, d23, d22 // t48 + vqsub.s16 d31, d23, d22 // t55 + + vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a + vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a + vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a + vrshrn.i32 d25, q2, #12 // t56a + vrshrn.i32 d27, q3, #12 // t39a + vneg.s32 q4, q4 // t40a + vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a + vrshrn.i32 d31, q4, #12 // t40a + vrshrn.i32 d28, q2, #12 // t55a + + vqadd.s16 d16, d24, d29 // t32a + vqsub.s16 d19, d24, d29 // t47a + vqadd.s16 d17, d27, d31 // t39 + vqsub.s16 d18, d27, d31 // t40 + vqsub.s16 d20, d26, d30 // t48a + vqadd.s16 d23, d26, d30 // t63a + vqsub.s16 d21, d25, d28 // t55 + vqadd.s16 d22, d25, d28 // t56 + + vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a + vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a + vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 + vrshrn.i32 d18, q2, #12 // t40a + vrshrn.i32 d21, q3, #12 // t55a + vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 + vrshrn.i32 d19, q4, 
#12 // t47 + vrshrn.i32 d20, q2, #12 // t48 + + vstr d16, [r6, #2*4*0] // t32a + vstr d17, [r9, #2*4*0] // t39 + vstr d18, [r6, #2*4*8] // t40a + vstr d19, [r9, #2*4*8] // t47 + vstr d20, [r6, #2*4*16] // t48 + vstr d21, [r9, #2*4*16] // t55a + vstr d22, [r6, #2*4*24] // t56 + vstr d23, [r9, #2*4*24] // t63a + + add r6, r6, #2*4 + sub r9, r9, #2*4 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.16 {\i}, [\src, :64] + vst1.16 {\zero}, [\src, :64], \strd +.else + vld1.16 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.16 {q8, q9}, [\dst, :128]! + vst1.16 {q10, q11}, [\dst, :128]! + vst1.16 {q12, q13}, [\dst, :128]! + vst1.16 {q14, q15}, [\dst, :128]! +.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i16 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + movw \gpr, \val + vdup.16 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.16 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_4h_x16_neon + + store16 r6 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_4h_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + 
mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.16 {d2}, [r6, :64]! + vld1.16 {d3}, [r6, :64]! + vqadd.s16 d6, d2, \r0 + vqsub.s16 \r0, d2, \r0 + vld1.16 {d4}, [r6, :64]! + vqadd.s16 d7, d3, \r1 + vqsub.s16 \r1, d3, \r1 + vld1.16 {d5}, [r6, :64]! + vqadd.s16 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s16 \r2, d4, \r2 + vst1.16 {d6}, [r6, :64]! + vst1.16 {\r0}, [r10, :64], r9 + vqadd.s16 d3, d5, \r3 + vqsub.s16 \r3, d5, \r3 + vst1.16 {d7}, [r6, :64]! + vst1.16 {\r1}, [r10, :64], r9 + vst1.16 {d2}, [r6, :64]! + vst1.16 {\r2}, [r10, :64], r9 + vst1.16 {d3}, [r6, :64]! + vst1.16 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.16 {d16}, [r7, :64] // in1 (offset 0) + vld1.16 {d17}, [r9, :64] // in31 (offset 15) + vld1.16 {d18}, [r10, :64] // in17 (offset 8) + vld1.16 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.16 {d16}, [r10, :64] // in7 (offset 3) + vld1.16 {d17}, [r11, :64] // in25 (offset 12) + vld1.16 {d18}, [r9, :64] // in23 (offset 11) + vld1.16 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + 
movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.16 d16, [r10, :64] // in5 (offset 2) + vld1.16 d17, [r11, :64] // in27 (offset 13) + vld1.16 d18, [r9, :64] // in21 (offset 10) + vld1.16 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.16 d16, [r10, :64] // in3 (offset 1) + vld1.16 d17, [r11, :64] // in29 (offset 14) + vld1.16 d18, [r9, :64] // in19 (offset 9) + vld1.16 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x4_neon + vdup.16 q3, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q13, q12, d27, d26, d25, d24 + +.macro store_addsub src0, src1, src2, src3 + vqsub.s16 d3, \src0, \src1 + vqsub.s16 d2, \src2, \src3 + vqadd.s16 d0, \src0, \src1 + vqadd.s16 d1, \src2, \src3 + vrshl.s16 q1, q1, q3 + vrshl.s16 q0, q0, q3 + vrev64.16 q1, q1 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q1}, [r9, :128], r10 +.endm + store_addsub d16, d31, d20, d27 + store_addsub d17, d30, d21, d26 + store_addsub d18, d29, d22, d25 + store_addsub d19, d28, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #2 + sub r9, r9, r10, lsl #2 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + +.macro add_dest_addsub src0, src1, src2, src3 + vld1.32 {d0[0]}, [r6, :32], r1 + vld1.32 {d1[0]}, [r9, :32], r10 + vqadd.s16 d4, \src0, \src1 + vld1.32 {d0[1]}, [r6, :32] + vqadd.s16 d5, \src2, \src3 + vld1.32 {d1[1]}, [r9, :32] + vqsub.s16 d6, \src0, \src1 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vaddw.u8 q2, q2, d0 + vaddw.u8 q3, q3, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 + vst1.32 {d0[0]}, [r6, :32], r1 + vst1.32 {d1[0]}, [r9, :32], r10 + vst1.32 {d0[1]}, [r6, :32], r1 + vst1.32 {d1[1]}, [r9, :32], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_scale_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 4, 8, 12 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: + movrel_local r5, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/loopfilter.S b/third_party/dav1d/src/arm/32/loopfilter.S new file mode 100644 index 0000000000..25f993d176 --- /dev/null +++ b/third_party/dav1d/src/arm/32/loopfilter.S @@ -0,0 +1,868 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_8_wd\wd\()_neon + vabd.u8 d0, d22, d23 // abs(p1 - p0) + vabd.u8 d1, d25, d24 // abs(q1 - q0) + vabd.u8 d2, d23, d24 // abs(p0 - q0) + vabd.u8 d3, d22, d25 // abs(p1 - q1) +.if \wd >= 6 + vabd.u8 d4, d21, d22 // abs(p2 - p1) + vabd.u8 d5, d26, d25 // abs(q2 - q1) +.endif +.if \wd >= 8 + vabd.u8 d6, d20, d21 // abs(p3 - p2) + vabd.u8 d7, d27, d26 // abs(q3 - q2) +.endif +.if \wd >= 6 + vmax.u8 d4, d4, d5 +.endif + vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2 +.if \wd >= 8 + vmax.u8 d6, d6, d7 +.endif + vshr.u8 d3, d3, #1 +.if \wd >= 8 + vmax.u8 d4, d4, d6 +.endif +.if \wd >= 6 + vand d4, d4, d14 +.endif + vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) + vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + vmax.u8 d4, d0, d4 + vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + vand d1, d1, d2 // fm + vand d1, d1, d13 // fm && wd >= 4 +.if \wd >= 6 + vand d14, d14, d1 // fm && wd > 4 +.endif +.if \wd >= 16 + vand d15, d15, d1 // fm && wd == 16 +.endif + + vmov r10, r11, d1 + orrs r10, r10, r11 + beq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + vmov.i8 d10, #1 + vabd.u8 d2, d21, d23 // abs(p2 - p0) + vabd.u8 d3, d22, d23 // abs(p1 - p0) + vabd.u8 d4, d25, d24 // abs(q1 - q0) + vabd.u8 d5, d26, d24 // abs(q2 - q0) +.if \wd >= 8 + vabd.u8 d6, d20, d23 // abs(p3 - p0) + vabd.u8 d7, d27, d24 // abs(q3 - q0) +.endif + vmax.u8 d2, d2, d3 + vmax.u8 d4, d4, d5 +.if \wd >= 8 + vmax.u8 d6, d6, d7 +.endif + vmax.u8 d2, d2, d4 +.if \wd >= 8 + vmax.u8 d2, d2, d6 +.endif + +.if \wd == 16 + vabd.u8 d3, d17, d23 // abs(p6 - p0) + vabd.u8 d4, d18, d23 // abs(p5 - p0) + vabd.u8 d5, d19, d23 // abs(p4 - p0) +.endif + vcge.u8 d2, d10, d2 // flat8in +.if \wd == 16 + vabd.u8 d6, d28, d24 // abs(q4 -
q0) + vabd.u8 d7, d29, d24 // abs(q5 - q0) + vabd.u8 d8, d30, d24 // abs(q6 - q0) +.endif + vand d14, d2, d14 // flat8in && fm && wd > 4 + vbic d1, d1, d14 // fm && wd >= 4 && !flat8in +.if \wd == 16 + vmax.u8 d3, d3, d4 + vmax.u8 d5, d5, d6 +.endif + vmov r10, r11, d1 +.if \wd == 16 + vmax.u8 d7, d7, d8 + vmax.u8 d3, d3, d5 + vmax.u8 d3, d3, d7 + vcge.u8 d3, d10, d3 // flat8out +.endif + orrs r10, r10, r11 +.if \wd == 16 + vand d15, d15, d3 // flat8out && fm && wd == 16 + vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 + vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out +.endif + beq 1f // skip wd == 4 case +.endif + + vsubl.u8 q1, d22, d25 // p1 - q1 + vcgt.u8 d0, d0, d12 // hev + vqmovn.s16 d2, q1 + vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) + vbic d0, d1, d0 // (fm && wd >= 4 && !hev) + vsubl.u8 q1, d24, d23 + vmov.i16 q3, #3 + vmul.i16 q1, q1, q3 + vmov.i8 d6, #4 + vaddw.s8 q1, q1, d4 + vmov.i8 d7, #3 + vqmovn.s16 d2, q1 // f + vqadd.s8 d4, d6, d2 // imin(f + 4, 127) + vqadd.s8 d5, d7, d2 // imin(f + 3, 127) + vshr.s8 d4, d4, #3 // f1 + vshr.s8 d5, d5, #3 // f2 + vmovl.u8 q1, d23 // p0 + vmovl.u8 q3, d24 // q0 + vaddw.s8 q1, q1, d5 + vsubw.s8 q3, q3, d4 + vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1 + vqmovun.s16 d2, q1 // out p0 + vqmovun.s16 d6, q3 // out q0 + vbit d23, d2, d1 // if (fm && wd >= 4) + vmovl.u8 q1, d22 // p1 + vbit d24, d6, d1 // if (fm && wd >= 4) + vmovl.u8 q3, d25 // q1 + vaddw.s8 q1, q1, d4 + vsubw.s8 q3, q3, d4 + vqmovun.s16 d2, q1 // out p1 + vqmovun.s16 d6, q3 // out q1 + vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) + vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 2f // skip if there's no flat8in + + vaddl.u8 q0, d21, d21 // p2 * 2 + vaddl.u8 q1, d21, d22 // p2 + p1 + vaddl.u8 q2, d22, d23 // p1 + p0 + vaddl.u8 q3, d23, d24 // p0 + q0 + vadd.i16 q4, q0, q1 + vadd.i16 q5, q2, q3 + vaddl.u8 q6, d24, d25 // q0 + q1 + vadd.i16 q4, q4, q5 + vsub.i16 q6, 
q6, q0 + vaddl.u8 q5, d25, d26 // q1 + q2 + vrshrn.i16 d0, q4, #3 // out p1 + + vadd.i16 q4, q4, q6 + vsub.i16 q5, q5, q1 + vaddl.u8 q6, d26, d26 // q2 + q2 + vrshrn.i16 d1, q4, #3 // out p0 + + vadd.i16 q4, q4, q5 + vsub.i16 q6, q6, q2 + vrshrn.i16 d2, q4, #3 // out q0 + + vbit d22, d0, d14 // p1 if (flat8in) + vadd.i16 q4, q4, q6 + vbit d23, d1, d14 // p0 if (flat8in) + vrshrn.i16 d3, q4, #3 // out q1 + vbit d24, d2, d14 // q0 if (flat8in) + vbit d25, d3, d14 // q1 if (flat8in) +.elseif \wd >= 8 + vmov r10, r11, d14 + orrs r10, r10, r11 +.if \wd == 8 + beq 8f // skip if there's no flat8in +.else + beq 2f // skip if there's no flat8in +.endif + + vaddl.u8 q0, d20, d21 // p3 + p2 + vaddl.u8 q1, d22, d25 // p1 + q1 + vaddl.u8 q2, d20, d22 // p3 + p1 + vaddl.u8 q3, d23, d26 // p0 + q2 + vadd.i16 q4, q0, q0 // 2 * (p3 + p2) + vaddw.u8 q4, q4, d23 // + p0 + vaddw.u8 q4, q4, d24 // + q0 + vadd.i16 q4, q4, q2 // + p3 + p1 + vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2 + vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1 + vrshrn.i16 d10, q4, #3 // out p2 + + vadd.i16 q4, q4, q1 + vaddl.u8 q0, d20, d23 // p3 + p0 + vaddl.u8 q1, d24, d27 // q0 + q3 + vrshrn.i16 d11, q4, #3 // out p1 + + vadd.i16 q4, q4, q3 + vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0 + vaddl.u8 q2, d21, d24 // p2 + q0 + vaddl.u8 q3, d25, d27 // q1 + q3 + vrshrn.i16 d12, q4, #3 // out p0 + + vadd.i16 q4, q4, q1 + vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0 + vaddl.u8 q0, d22, d25 // p1 + q1 + vaddl.u8 q1, d26, d27 // q2 + q3 + vrshrn.i16 d13, q4, #3 // out q0 + + vadd.i16 q4, q4, q3 + vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1 + vrshrn.i16 d0, q4, #3 // out q1 + + vadd.i16 q4, q4, q1 + + vbit d21, d10, d14 + vbit d22, d11, d14 + vbit d23, d12, d14 + vrshrn.i16 d1, q4, #3 // out q2 + vbit d24, d13, d14 + vbit d25, d0, d14 + vbit d26, d1, d14 +.endif +2: +.if \wd == 16 + vmov r10, r11, d15 + orrs r10, r10, r11 + bne 1f // check if flat8out is needed + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 8f // if there was no flat8in, 
just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + vaddl.u8 q1, d17, d17 // p6 + p6 + vaddl.u8 q2, d17, d18 // p6 + p5 + vaddl.u8 q3, d17, d19 // p6 + p4 + vaddl.u8 q4, d17, d20 // p6 + p3 + vadd.i16 q6, q1, q2 + vadd.i16 q5, q3, q4 + vaddl.u8 q3, d17, d21 // p6 + p2 + vadd.i16 q6, q6, q5 + vaddl.u8 q4, d17, d22 // p6 + p1 + vaddl.u8 q5, d18, d23 // p5 + p0 + vadd.i16 q3, q3, q4 + vaddl.u8 q4, d19, d24 // p4 + q0 + vadd.i16 q6, q6, q3 + vadd.i16 q5, q5, q4 + vaddl.u8 q3, d20, d25 // p3 + q1 + vadd.i16 q6, q6, q5 + vsub.i16 q3, q3, q1 + vaddl.u8 q1, d21, d26 // p2 + q2 + vrshrn.i16 d0, q6, #4 // out p5 + vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1) + vsub.i16 q1, q1, q2 + vaddl.u8 q2, d22, d27 // p1 + q3 + vaddl.u8 q3, d17, d19 // p6 + p4 + vrshrn.i16 d1, q6, #4 // out p4 + vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2) + vsub.i16 q2, q2, q3 + vaddl.u8 q3, d23, d28 // p0 + q4 + vaddl.u8 q4, d17, d20 // p6 + p3 + vrshrn.i16 d2, q6, #4 // out p3 + vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3) + vsub.i16 q3, q3, q4 + vaddl.u8 q4, d24, d29 // q0 + q5 + vaddl.u8 q2, d17, d21 // p6 + p2 + vrshrn.i16 d3, q6, #4 // out p2 + vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4) + vsub.i16 q4, q4, q2 + vaddl.u8 q3, d25, d30 // q1 + q6 + vaddl.u8 q5, d17, d22 // p6 + p1 + vrshrn.i16 d4, q6, #4 // out p1 + vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5) + vsub.i16 q3, q3, q5 + vaddl.u8 q4, d26, d30 // q2 + q6 + vbif d0, d18, d15 // out p5 + vaddl.u8 q5, d18, d23 // p5 + p0 + vrshrn.i16 d5, q6, #4 // out p0 + vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6) + vsub.i16 q4, q4, q5 + vaddl.u8 q5, d27, d30 // q3 + q6 + vbif d1, d19, d15 // out p4 + vaddl.u8 q9, d19, d24 // p4 + q0 + vrshrn.i16 d6, q6, #4 // out q0 + vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6) + vsub.i16 q5, q5, q9 + vaddl.u8 q4, d28, d30 // q4 + q6 + vbif d2, d20, d15 // out p3 + vaddl.u8 q9, d20, d25 // p3 + q1 + vrshrn.i16 d7, q6, #4 // out q1 + vadd.i16 q6, q6, q5 // - (p4 + 
q0) + (q3 + q6) + vsub.i16 q9, q4, q9 + vaddl.u8 q5, d29, d30 // q5 + q6 + vbif d3, d21, d15 // out p2 + vaddl.u8 q10, d21, d26 // p2 + q2 + vrshrn.i16 d8, q6, #4 // out q2 + vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6) + vsub.i16 q5, q5, q10 + vaddl.u8 q9, d30, d30 // q6 + q6 + vbif d4, d22, d15 // out p1 + vaddl.u8 q10, d22, d27 // p1 + q3 + vrshrn.i16 d9, q6, #4 // out q3 + vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6) + vsub.i16 q9, q9, q10 + vbif d5, d23, d15 // out p0 + vrshrn.i16 d10, q6, #4 // out q4 + vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6) + vrshrn.i16 d11, q6, #4 // out q5 + vbif d6, d24, d15 // out q0 + vbif d7, d25, d15 // out q1 + vbif d8, d26, d15 // out q2 + vbif d9, d27, d15 // out q3 + vbif d10, d28, d15 // out q4 + vbif d11, d29, d15 // out q5 +.endif + + bx lr +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + bx r8 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + bx r9 +.endif +9: + // Return directly without writing back any pixels + bx r12 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_8_wd16 + adr r8, 7f + CONFIG_THUMB + adr r9, 8f + CONFIG_THUMB + bl lpf_8_wd16_neon +.endm + +.macro lpf_8_wd8 + adr r9, 8f + CONFIG_THUMB + bl lpf_8_wd8_neon +.endm + +.macro lpf_8_wd6 + bl lpf_8_wd6_neon +.endm + +.macro lpf_8_wd4 + bl lpf_8_wd4_neon +.endm + +function lpf_v_4_8_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + + lpf_8_wd4 + + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_4_8_neon + mov r12, lr + sub r10, r0, #2 + add r0, r10, r1, lsl #2 + vld1.32 {d22[0]}, 
[r10], r1 + vld1.32 {d22[1]}, [r0], r1 + vld1.32 {d23[0]}, [r10], r1 + vld1.32 {d23[1]}, [r0], r1 + vld1.32 {d24[0]}, [r10], r1 + vld1.32 {d24[1]}, [r0], r1 + vld1.32 {d25[0]}, [r10], r1 + vld1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + + transpose_4x8b q11, q12, d22, d23, d24, d25 + + lpf_8_wd4 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +function lpf_v_6_8_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + lpf_8_wd6 + + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_6_8_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #2 + vld1.8 {d20}, [r10], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d21}, [r10], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d22}, [r10], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d23}, [r10], r1 + vld1.8 {d27}, [r0], r1 + add r0, r0, #4 + + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + + lpf_8_wd6 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + 
vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +function lpf_v_8_8_neon + mov r12, lr + sub r10, r0, r1, lsl #2 + vld1.8 {d20}, [r10, :64], r1 // p3 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d26}, [r0, :64], r1 // q2 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d27}, [r0, :64], r1 // q3 + sub r0, r0, r1, lsl #2 + + lpf_8_wd8 + + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vst1.8 {d21}, [r10, :64], r1 // p2 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d25}, [r0, :64], r1 // q1 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_8_8_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #2 + vld1.8 {d20}, [r10], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d21}, [r10], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d22}, [r10], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d23}, [r10], r1 + vld1.8 {d27}, [r0], r1 + add r0, r0, #4 + + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + + lpf_8_wd8 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #4 + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + add r0, r10, r1, lsl #2 + + vst1.8 {d20}, [r10], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d21}, [r10], r1 + vst1.8 {d25}, [r0], r1 + vst1.8 {d22}, [r10], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d23}, [r10], r1 + vst1.8 {d27}, [r0], r1 + add r0, r0, #4 + bx r12 +8: + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 
{d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +function lpf_v_16_8_neon + mov r12, lr + + sub r10, r0, r1, lsl #3 + add r10, r10, r1 + vld1.8 {d17}, [r10, :64], r1 // p6 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d18}, [r10, :64], r1 // p5 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d19}, [r10, :64], r1 // p4 + vld1.8 {d26}, [r0, :64], r1 // q2 + vld1.8 {d20}, [r10, :64], r1 // p3 + vld1.8 {d27}, [r0, :64], r1 // q3 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d28}, [r0, :64], r1 // q4 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d29}, [r0, :64], r1 // q5 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d30}, [r0, :64], r1 // q6 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + + lpf_8_wd16 + + sub r10, r0, r1, lsl #2 + sub r10, r10, r1, lsl #1 + vst1.8 {d0}, [r10, :64], r1 // p5 + vst1.8 {d6}, [r0, :64], r1 // q0 + vst1.8 {d1}, [r10, :64], r1 // p4 + vst1.8 {d7}, [r0, :64], r1 // q1 + vst1.8 {d2}, [r10, :64], r1 // p3 + vst1.8 {d8}, [r0, :64], r1 // q2 + vst1.8 {d3}, [r10, :64], r1 // p2 + vst1.8 {d9}, [r0, :64], r1 // q3 + vst1.8 {d4}, [r10, :64], r1 // p1 + vst1.8 {d10}, [r0, :64], r1 // q4 + vst1.8 {d5}, [r10, :64], r1 // p0 + vst1.8 {d11}, [r0, :64], r1 // q5 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + bx r12 +7: + sub r10, r0, r1 + sub r10, r10, r1, lsl #1 + vst1.8 {d21}, [r10, :64], r1 // p2 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d25}, [r0, :64], r1 // q1 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_16_8_neon + mov r12, lr + sub r10, r0, #8 + vld1.8 {d16}, [r10, :64], 
r1 + vld1.8 {d24}, [r0, :64], r1 + vld1.8 {d17}, [r10, :64], r1 + vld1.8 {d25}, [r0, :64], r1 + vld1.8 {d18}, [r10, :64], r1 + vld1.8 {d26}, [r0, :64], r1 + vld1.8 {d19}, [r10, :64], r1 + vld1.8 {d27}, [r0, :64], r1 + vld1.8 {d20}, [r10, :64], r1 + vld1.8 {d28}, [r0, :64], r1 + vld1.8 {d21}, [r10, :64], r1 + vld1.8 {d29}, [r0, :64], r1 + vld1.8 {d22}, [r10, :64], r1 + vld1.8 {d30}, [r0, :64], r1 + vld1.8 {d23}, [r10, :64], r1 + vld1.8 {d31}, [r0, :64], r1 + + transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31 + + lpf_8_wd16 + + sub r0, r0, r1, lsl #3 + sub r10, r0, #8 + + transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5 + transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31 + + vst1.8 {d16}, [r10, :64], r1 + vst1.8 {d6}, [r0, :64], r1 + vst1.8 {d17}, [r10, :64], r1 + vst1.8 {d7}, [r0, :64], r1 + vst1.8 {d0}, [r10, :64], r1 + vst1.8 {d8}, [r0, :64], r1 + vst1.8 {d1}, [r10, :64], r1 + vst1.8 {d9}, [r0, :64], r1 + vst1.8 {d2}, [r10, :64], r1 + vst1.8 {d10}, [r0, :64], r1 + vst1.8 {d3}, [r10, :64], r1 + vst1.8 {d11}, [r0, :64], r1 + vst1.8 {d4}, [r10, :64], r1 + vst1.8 {d30}, [r0, :64], r1 + vst1.8 {d5}, [r10, :64], r1 + vst1.8 {d31}, [r0, :64], r1 + bx r12 + +7: + sub r10, r0, r1, lsl #3 + sub r10, r10, #4 + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + add r0, r10, r1, lsl #2 + + vst1.8 {d20}, [r10], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d21}, [r10], r1 + vst1.8 {d25}, [r0], r1 + vst1.8 {d22}, [r10], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d23}, [r10], r1 + vst1.8 {d27}, [r0], r1 + add r0, r0, #4 + bx r12 +8: + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 
{d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [r2] // vmask[0], vmask[1] +.ifc \type, y + ldr r2, [r2, #8] // vmask[2] +.endif + add r5, r5, #128 // Move to sharp part of lut +.ifc \type, y + orr r7, r7, r2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub r4, r3, r4, lsl #2 +.else + sub r3, r3, #4 + lsl r4, r4, #2 +.endif + orr r6, r6, r7 // vmask[0] |= vmask[1] + +1: + tst r6, #0x03 +.ifc \dir, v + vld1.8 {d0}, [r4]! + vld1.8 {d1}, [r3]! +.else + vld2.32 {d0[0], d1[0]}, [r3], r4 + vld2.32 {d0[1], d1[1]}, [r3], r4 +.endif + beq 7f // if (!(vm & bits)) continue; + + vld1.8 {d5[]}, [r5] // sharp[0] + add r5, r5, #8 + vmov.i32 d2, #0xff + vdup.32 d13, r6 // vmask[0] + + vand d0, d0, d2 // Keep only lowest byte in each 32 bit word + vand d1, d1, d2 + vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0] + vmov.i8 d4, #1 + vld1.8 {d6[]}, [r5] // sharp[1] + sub r5, r5, #8 + vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] + vmul.i32 d1, d1, d4 // L +.ifc \type, y + vdup.32 d15, r2 // vmask[2] +.endif + vtst.32 d2, d1, d2 // L != 0 + vdup.32 d14, r7 // vmask[1] + vmov r10, r11, d2 + orrs r10, r10, r11 + beq 7f // if (!L) continue; + vneg.s8 d5, d5 // -sharp[0] + movrel_local r10, word_12 + vshr.u8 d12, d1, #4 // H + vld1.32 {d16}, [r10, :64] + vshl.s8 d3, d1, d5 // L >> sharp[0] +.ifc \type, y + vtst.32 d15, d15, d16 // if (vmask[2] & bits) +.endif + vmov.i8 d7, #2 + vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1]) + vadd.i8 d0, d1, d7 // L + 2 + vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I + vadd.u8 d0, d0, d0 // 2*(L + 2) + vtst.32 d14, d14, d16 // if (vmask[1] & 
bits) + vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E + vtst.32 d13, d13, d16 // if (vmask[0] & bits) + vand d13, d13, d2 // vmask[0] &= L != 0 + +.ifc \type, y + tst r2, #0x03 + beq 2f + // wd16 + bl lpf_\dir\()_16_8_neon + b 8f +2: +.endif + tst r7, #0x03 + beq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_8_neon +.else + // wd6 + bl lpf_\dir\()_6_8_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_8_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment r0. + // If the whole function is skipped, increment it here instead. + add r0, r0, r1, lsl #3 +.else +7: +.endif +8: + lsrs r6, r6, #2 // vmask[0] >>= 2 + lsr r7, r7, #2 // vmask[1] >>= 2 +.ifc \type, y + lsr r2, r2, #2 // vmask[2] >>= 2 +.endif +.ifc \dir, v + add r0, r0, #8 +.else + // For dir h, r0 is returned incremented +.endif + bne 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_12, align=4 + .word 1, 2 +endconst diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S new file mode 100644 index 0000000000..073525a5a5 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration.S @@ -0,0 +1,2099 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges); +function wiener_filter_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4} + ldrd r4, r5, [sp, #52] + ldrd r6, r7, [sp, #60] + mov r8, r5 + vld1.16 {q0}, [r4] + movw r9, #(1 << 14) - (1 << 2) + vdup.16 q14, r9 + vmov.s16 q15, #2048 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Clear the last unused element of q0, to allow filtering a single + // pixel with one plain vmul+vpadd. + mov r12, #0 + vmov.16 d1[3], r12 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the width from mid_stride + sub r10, r10, r5, lsl #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. 
+ cmp r5, #8 + add r11, r5, #13 + bic r11, r11, #7 + bge 1f + mov r11, #16 +1: + sub r3, r3, r11 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #3 + sub lr, lr, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #3 + + +1: // Loop vertically + vld1.8 {q2}, [r2]! + vld1.8 {q9}, [lr]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[1]}, [r1]! + // Move r2/lr back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub r2, r2, #3 + sub lr, lr, #3 + vld1.32 {d17[1]}, [r1]! + vext.8 q2, q1, q2, #13 + vext.8 q9, q8, q9, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q2 to have 3x the first byte at the front. + vdup.8 q1, d4[0] + vdup.8 q8, d18[0] + // Move r2 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub r2, r2, #3 + sub lr, lr, #3 + vext.8 q2, q1, q2, #13 + vext.8 q9, q8, q9, #13 + +2: + vmovl.u8 q1, d4 + vmovl.u8 q2, d5 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + ldrb r11, [r2, r9] + ldrb r9, [lr, r9] + // Fill q12/q13 with the right padding pixel + vdup.8 d24, r11 + vdup.8 d26, r9 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + cmp r5, #7 + bge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally + // This is tuned as some sort of compromise between Cortex A7, A8, + // A9 and A53. + vmul.s16 q3, q1, d0[0] + vext.8 q10, q1, q2, #2 + vext.8 q11, q1, q2, #4 + vmla.s16 q3, q10, d0[1] + vmla.s16 q3, q11, d0[2] + vext.8 q10, q1, q2, #6 + vext.8 q11, q1, q2, #8 + vmla.s16 q3, q10, d0[3] + vmla.s16 q3, q11, d1[0] + vext.8 q10, q1, q2, #10 + vext.8 q11, q1, q2, #12 + vmla.s16 q3, q10, d1[1] + vmla.s16 q3, q11, d1[2] + + vmul.s16 q10, q8, d0[0] + vext.8 q11, q8, q9, #2 + vext.8 q4, q8, q9, #4 + vmla.s16 q10, q11, d0[1] + vmla.s16 q10, q4, d0[2] + vext.8 q11, q8, q9, #6 + vext.8 q4, q8, q9, #8 + vmla.s16 q10, q11, d0[3] + vmla.s16 q10, q4, d1[0] + vext.8 q11, q8, q9, #10 + vext.8 q4, q8, q9, #12 + vmla.s16 q10, q11, d1[1] + vmla.s16 q10, q4, d1[2] + + vext.8 q1, q1, q2, #6 + vext.8 q8, q8, q9, #6 + vshl.s16 q1, q1, #7 + vshl.s16 q8, q8, #7 + vsub.s16 q1, q1, q14 + vsub.s16 q8, q8, q14 + vqadd.s16 q3, q3, q1 + vqadd.s16 q10, q10, q8 + vshr.s16 q3, q3, #3 + vshr.s16 q10, q10, #3 + vadd.s16 q3, q3, q15 + vadd.s16 q10, q10, q15 + vst1.16 {q3}, [r0, :128]! + vst1.16 {q10}, [r12, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q1, q2 + vmov q8, q9 + vld1.8 {d4}, [r2]! + vld1.8 {d18}, [lr]! + vmovl.u8 q2, d4 + vmovl.u8 q9, d18 + bne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Filter 4 pixels, 7 <= w < 11 +.macro filter_4 + vext.8 d20, d2, d3, #2 + vext.8 d21, d2, d3, #4 + vext.8 d22, d2, d3, #6 + vext.8 d23, d3, d4, #2 + vext.8 d8, d3, d4, #4 + vmul.s16 d6, d2, d0[0] + vmla.s16 d6, d20, d0[1] + vmla.s16 d6, d21, d0[2] + vmla.s16 d6, d22, d0[3] + vmla.s16 d6, d3, d1[0] + vmla.s16 d6, d23, d1[1] + vmla.s16 d6, d8, d1[2] + + vext.8 d20, d16, d17, #2 + vext.8 d21, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d23, d17, d18, #2 + vext.8 d8, d17, d18, #4 + vmul.s16 d7, d16, d0[0] + vmla.s16 d7, d20, d0[1] + vmla.s16 d7, d21, d0[2] + vmla.s16 d7, d22, d0[3] + vmla.s16 d7, d17, d1[0] + vmla.s16 d7, d23, d1[1] + vmla.s16 d7, d8, d1[2] + + vext.8 d22, d2, d3, #6 + vext.8 d23, d16, d17, #6 + vshl.s16 q11, q11, #7 + vsub.s16 q11, q11, q14 + vqadd.s16 q3, q3, q11 + vshr.s16 q3, q3, #3 + vadd.s16 q3, q3, q15 +.endm + filter_4 + vst1.16 {d6}, [r0, :64]! + vst1.16 {d7}, [r12, :64]! + + subs r5, r5, #4 // 3 <= w < 7 + vext.8 q1, q1, q2, #8 + vext.8 q2, q2, q2, #8 + vext.8 q8, q8, q9, #8 + vext.8 q9, q9, q9, #8 + +6: // Pad the right edge and filter the last few pixels. + // w < 7, w+3 pixels valid in q1-q2 + cmp r5, #5 + blt 7f + bgt 8f + // w == 5, 8 pixels valid in q1, q2 invalid + vmov q2, q12 + vmov q9, q13 + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in q1 + sub r9, r5, #1 + // r9 = (pixels valid - 4) + adr r11, L(variable_shift_tbl) + ldr r9, [r11, r9, lsl #2] + add r11, r11, r9 + vmov q2, q12 + vmov q9, q13 + bx r11 + + .align 2 +L(variable_shift_tbl): + .word 44f - L(variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(variable_shift_tbl) + CONFIG_THUMB + .word 66f - L(variable_shift_tbl) + CONFIG_THUMB + .word 77f - L(variable_shift_tbl) + CONFIG_THUMB + +44: // 4 pixels valid in d2/d16, fill d3/d17 with padding. + vmov d3, d4 + vmov d17, d18 + b 88f + // Shift q1 right, shifting out invalid pixels, + // shift q1 left to the original offset, shifting in padding pixels. 
+55: // 5 pixels valid + vext.8 q1, q1, q1, #10 + vext.8 q1, q1, q2, #6 + vext.8 q8, q8, q8, #10 + vext.8 q8, q8, q9, #6 + b 88f +66: // 6 pixels valid + vext.8 q1, q1, q1, #12 + vext.8 q1, q1, q2, #4 + vext.8 q8, q8, q8, #12 + vext.8 q8, q8, q9, #4 + b 88f +77: // 7 pixels valid + vext.8 q1, q1, q1, #14 + vext.8 q1, q1, q2, #2 + vext.8 q8, q8, q8, #14 + vext.8 q8, q8, q9, #2 + b 88f + +8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2 + vext.8 q2, q2, q2, #2 + vext.8 q2, q2, q12, #14 + vext.8 q9, q9, q9, #2 + vext.8 q9, q9, q13, #14 + +88: + // w < 7, q1-q2 padded properly + cmp r5, #4 + blt 888f + + // w >= 4, filter 4 pixels + filter_4 + vst1.16 {d6}, [r0, :64]! + vst1.16 {d7}, [r12, :64]! + subs r5, r5, #4 // 0 <= w < 4 + vext.8 q1, q1, q2, #8 + vext.8 q8, q8, q9, #8 + beq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + vmul.s16 q3, q1, q0 + vmul.s16 q10, q8, q0 + vpadd.s16 d6, d6, d7 + vpadd.s16 d7, d20, d21 + vdup.16 d24, d2[3] + vpadd.s16 d6, d6, d7 + vdup.16 d25, d16[3] + vpadd.s16 d6, d6, d6 + vtrn.16 d24, d25 + vshl.s16 d24, d24, #7 + vsub.s16 d24, d24, d28 + vqadd.s16 d6, d6, d24 + vshr.s16 d6, d6, #3 + vadd.s16 d6, d6, d30 + vst1.s16 {d6[0]}, [r0, :16]! + vst1.s16 {d6[1]}, [r12, :16]! 
+ subs r5, r5, #1 + vext.8 q1, q1, q2, #2 + vext.8 q8, q8, q9, #2 + bgt 888b + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4} + pop {r4-r11,pc} +.purgem filter_4 +endfunc + +// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride); +function wiener_filter_v_8bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldrd r6, r7, [sp, #28] + mov lr, r4 + vmov.s16 q1, #0 + mov r12, #128 + vld1.16 {q0}, [r5] + vmov.s16 d2[3], r12 + vadd.s16 q0, q0, q1 + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. + tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vqmovun.s16 d4, q2 + vst1.8 {d4}, [r0], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, q12-q13 hold the 2 edge rows loaded above; pad the last edge row (q13) into q14, the last filter tap. + vmov q14, q13 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. + vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q13-q15. + vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #8 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + pop {r4-r7,pc} +.purgem filter +endfunc + +// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_8bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + adr r12, L(copy_narrow_tbl) + ldr r3, [r12, r3, lsl #2] + add r12, r12, r3 + bx r12 + + .align 2 +L(copy_narrow_tbl): + .word 0 + .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB + +10: + add r3, r0, r1 + lsl r1, r1, #1 +18: + subs r4, r4, #8 + blt 110f + vld1.8 {d0}, [r2, :64]! 
+ vst1.8 {d0[0]}, [r0], r1 + vst1.8 {d0[1]}, [r3], r1 + vst1.8 {d0[2]}, [r0], r1 + vst1.8 {d0[3]}, [r3], r1 + vst1.8 {d0[4]}, [r0], r1 + vst1.8 {d0[5]}, [r3], r1 + vst1.8 {d0[6]}, [r0], r1 + vst1.8 {d0[7]}, [r3], r1 + ble 0f + b 18b +110: + add r4, r4, #8 + asr r1, r1, #1 +11: + subs r4, r4, #1 + vld1.8 {d0[]}, [r2]! + vst1.8 {d0[0]}, [r0], r1 + bgt 11b +0: + pop {r4,pc} + +20: + add r3, r0, r1 + lsl r1, r1, #1 +24: + subs r4, r4, #4 + blt 210f + vld1.16 {d0}, [r2, :64]! + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r3, :16], r1 + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d0[3]}, [r3, :16], r1 + ble 0f + b 24b +210: + add r4, r4, #4 + asr r1, r1, #1 +22: + subs r4, r4, #1 + vld1.16 {d0[]}, [r2, :16]! + vst1.16 {d0[0]}, [r0, :16], r1 + bgt 22b +0: + pop {r4,pc} + +30: + ldrh r3, [r2] + ldrb r12, [r2, #2] + add r2, r2, #3 + subs r4, r4, #1 + strh r3, [r0] + strb r12, [r0, #2] + add r0, r0, r1 + bgt 30b + pop {r4,pc} + +40: + add r3, r0, r1 + lsl r1, r1, #1 +42: + subs r4, r4, #2 + blt 41f + vld1.8 {d0}, [r2, :64]! 
+ vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r3, :32], r1 + ble 0f + b 42b +41: + vld1.32 {d0[]}, [r2, :32] + vst1.32 {d0[0]}, [r0, :32] +0: + pop {r4,pc} + +50: + ldr r3, [r2] + ldrb r12, [r2, #4] + add r2, r2, #5 + subs r4, r4, #1 + str r3, [r0] + strb r12, [r0, #4] + add r0, r0, r1 + bgt 50b + pop {r4,pc} + +60: + ldr r3, [r2] + ldrh r12, [r2, #4] + add r2, r2, #6 + subs r4, r4, #1 + str r3, [r0] + strh r12, [r0, #4] + add r0, r0, r1 + bgt 60b + pop {r4,pc} + +70: + ldr r3, [r2] + ldrh r12, [r2, #4] + ldrb lr, [r2, #6] + add r2, r2, #7 + subs r4, r4, #1 + str r3, [r0] + strh r12, [r0, #4] + strb lr, [r0, #6] + add r0, r0, r1 + bgt 70b + pop {r4,pc} +endfunc + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. 
+ tst r7, #2 // LR_HAVE_RIGHT + bne 0f + // !LR_HAVE_RIGHT + add lr, r5, #3 + bic lr, lr, #3 + b 1f +0: + add lr, r5, #7 + bic lr, lr, #7 +1: + sub r9, r9, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Subtract the number of pixels read from the input from the stride + add lr, r5, #14 + bic lr, lr, #7 + sub r4, r4, lr + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #2 + sub r12, r12, #2 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #2 + + +1: // Loop vertically + vld1.8 {q0}, [r3]! + vld1.8 {q4}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[]}, [r2]! + // Move r3/r12 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub r3, r3, #2 + sub r12, r12, #2 + vld1.32 {d11[]}, [r2]! + vext.8 q0, q1, q0, #14 + vext.8 q4, q5, q4, #14 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q0 to have 2x the first byte at the front. + vdup.8 q1, d0[0] + vdup.8 q5, d8[0] + // Move r3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub r3, r3, #2 + sub r12, r12, #2 + vext.8 q0, q1, q0, #14 + vext.8 q4, q5, q4, #14 + +2: + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. 
+ sub lr, r5, #(2 + 16 - 2 + 1) + ldrb r11, [r3, lr] + ldrb lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.8 q14, r11 + vdup.8 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp r5, #10 + bge 4f // If w >= 10, all used input pixels are valid + cmp r5, #6 + bge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w + vaddl.u16 \dst1, \src1, \src3 +.if \w > 4 + vaddl.u16 \dst2, \src2, \src4 +.endif +.endm +.macro vaddw_u16_n dst1, dst2, src1, src2, w + vaddw.u16 \dst1, \dst1, \src1 +.if \w > 4 + vaddw.u16 \dst2, \dst2, \src2 +.endif +.endm +.macro vadd_i32_n dst1, dst2, src1, src2, w + vadd.i32 \dst1, \dst1, \src1 +.if \w > 4 + vadd.i32 \dst2, \dst2, \src2 +.endif +.endm + +.macro add3 w + vext.8 d16, d0, d1, #1 + vext.8 d17, d0, d1, #2 + vext.8 d18, d8, d9, #1 + vext.8 d19, d8, d9, #2 + vaddl.u8 q3, d0, d16 + vaddw.u8 q3, q3, d17 + vaddl.u8 q7, d8, d18 + vaddw.u8 q7, q7, d19 + + vext.8 q8, q1, q2, #2 + vext.8 q9, q1, q2, #4 + vext.8 q10, q5, q6, #2 + vext.8 q11, q5, q6, #4 + + vaddl_u16_n q12, q13, d2, d3, d16, d17, \w + vaddw_u16_n q12, q13, d18, d19, \w + + vaddl_u16_n q8, q9, d10, d11, d20, d21, \w + vaddw_u16_n q8, q9, d22, d23, \w +.endm + add3 8 + vst1.16 {q3}, [r1, :128]! + vst1.16 {q7}, [r11, :128]! + vst1.32 {q12, q13}, [r0, :128]! + vst1.32 {q8, q9}, [r10, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vld1.8 {d6}, [r3]! + vld1.8 {d14}, [r12]! + vmov q1, q2 + vmov q5, q6 + vext.8 q0, q0, q3, #8 + vext.8 q4, q4, q7, #8 + vmull.u8 q2, d6, d6 + vmull.u8 q6, d14, d14 + + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 6 <= w < 10 + add3 4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! 
+ vst1.32 {q12}, [r0, :128]! + vst1.32 {q8}, [r10, :128]! + + subs r5, r5, #4 // 2 <= w < 6 + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + +6: // Pad the right edge and produce the last few pixels. + // 2 <= w < 6, 2-5 pixels valid in q0 + sub lr, r5, #2 + // lr = (pixels valid - 2) + adr r11, L(box3_variable_shift_tbl) + ldr lr, [r11, lr, lsl #2] + add r11, r11, lr + bx r11 + + .align 2 +L(box3_variable_shift_tbl): + .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB + + // Shift q0 right, shifting out invalid pixels, + // shift q0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + vext.8 q0, q0, q0, #2 + vext.8 q4, q4, q4, #2 + vext.8 q0, q0, q14, #14 + vext.8 q4, q4, q15, #14 + b 88f +33: // 3 pixels valid + vext.8 q0, q0, q0, #3 + vext.8 q4, q4, q4, #3 + vext.8 q0, q0, q14, #13 + vext.8 q4, q4, q15, #13 + b 88f +44: // 4 pixels valid + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + vext.8 q0, q0, q14, #12 + vext.8 q4, q4, q15, #12 + b 88f +55: // 5 pixels valid + vext.8 q0, q0, q0, #5 + vext.8 q4, q4, q4, #5 + vext.8 q0, q0, q14, #11 + vext.8 q4, q4, q15, #11 + +88: + // Restore r11 after using it for a temporary value above + add r11, r1, #(2*SUM_STRIDE) + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + add3 4 + subs r5, r5, #4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! + vst1.32 {q12}, [r0, :128]! + vst1.32 {q8}, [r10, :128]! + ble 9f + vext.8 q0, q0, q0, #4 + vext.8 q1, q1, q2, #8 + vext.8 q4, q4, q4, #4 + vext.8 q5, q5, q6, #8 + // Only one needed pixel left, but do a normal 4 pixel + // addition anyway + add3 4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! + vst1.32 {q12}, [r0, :128]! + vst1.32 {q8}, [r10, :128]! 
+ +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. + tst r7, #2 // LR_HAVE_RIGHT + bne 0f + // !LR_HAVE_RIGHT + add lr, r5, #3 + bic lr, lr, #3 + add r8, r5, #13 + b 1f +0: + add lr, r5, #7 + bic lr, lr, #7 + add r8, r5, #15 +1: + sub r9, r9, lr, lsl #1 + bic r8, r8, #7 + sub r4, r4, r8 + + // Store the width for the vertical loop + mov r8, r5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #3 + sub r12, r12, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #3 + +1: // Loop vertically + vld1.8 {q0}, [r3]! + vld1.8 {q4}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[]}, [r2]! 
+ // Move r3/r12 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub r3, r3, #3 + sub r12, r12, #3 + vld1.32 {d11[]}, [r2]! + vext.8 q0, q1, q0, #13 + vext.8 q4, q5, q4, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q0 to have 3x the first byte at the front. + vdup.8 q1, d0[0] + vdup.8 q5, d8[0] + // Move r3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub r3, r3, #3 + sub r12, r12, #3 + vext.8 q0, q1, q0, #13 + vext.8 q4, q5, q4, #13 + +2: + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 3 + 1) + ldrb r11, [r3, lr] + ldrb lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.8 q14, r11 + vdup.8 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + cmp r5, #7 + bge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w + vext.8 d16, d0, d1, #1 + vext.8 d17, d0, d1, #2 + vext.8 d18, d0, d1, #3 + vext.8 d19, d0, d1, #4 + vext.8 d20, d8, d9, #1 + vext.8 d21, d8, d9, #2 + vext.8 d22, d8, d9, #3 + vext.8 d23, d8, d9, #4 + vaddl.u8 q3, d0, d16 + vaddl.u8 q12, d17, d18 + vaddl.u8 q7, d8, d20 + vaddl.u8 q13, d21, d22 + vaddw.u8 q3, q3, d19 + vaddw.u8 q7, q7, d23 + vadd.u16 q3, q3, q12 + vadd.u16 q7, q7, q13 + + vext.8 q8, q1, q2, #2 + vext.8 q9, q1, q2, #4 + vext.8 q10, q1, q2, #6 + vext.8 q11, q1, q2, #8 + vaddl_u16_n q12, q13, d2, d3, d16, d17, \w + vaddl_u16_n q8, q9, d18, d19, d20, d21, \w + vaddw_u16_n q12, q13, d22, d23, \w + vadd_i32_n q12, q13, q8, q9, \w + vext.8 q8, q5, q6, #2 + vext.8 q9, q5, q6, #4 + vext.8 q10, q5, q6, #6 + vext.8 q11, q5, q6, #8 +.if \w > 4 + vaddl_u16_n q1, q5, d10, d11, d16, d17, 8 + vaddl_u16_n q8, q9, d18, d19, d20, d21, 8 + vaddw_u16_n q1, q5, d22, d23, 8 + vadd.i32 q10, q1, q8 + vadd.i32 q11, q5, q9 +.else + // Can't clobber q1/q5 if only doing 4 pixels + vaddl.u16 q8, d10, d16 + vaddl.u16 q9, d18, d20 + vaddw.u16 q8, q8, d22 + vadd.i32 q10, q8, q9 +.endif +.endm + add5 8 + vst1.16 {q3}, [r1, :128]! + vst1.16 {q7}, [r11, :128]! + vst1.32 {q12, q13}, [r0, :128]! + vst1.32 {q10, q11}, [r10, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vld1.8 {d6}, [r3]! + vld1.8 {d14}, [r12]! + vmov q1, q2 + vmov q5, q6 + vext.8 q0, q0, q3, #8 + vext.8 q4, q4, q7, #8 + vmull.u8 q2, d6, d6 + vmull.u8 q6, d14, d14 + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 7 <= w < 11 + add5 4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! + vst1.32 {q12}, [r0, :128]! + vst1.32 {q10}, [r10, :128]! 
+ + subs r5, r5, #4 // 3 <= w < 7 + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in q0/q4 + sub lr, r5, #1 + // lr = pixels valid - 2 + adr r11, L(box5_variable_shift_tbl) + ldr lr, [r11, lr, lsl #2] + add r11, r11, lr + bx r11 + + .align 2 +L(box5_variable_shift_tbl): + .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB + + // Shift q0 right, shifting out invalid pixels, + // shift q0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + vext.8 q0, q0, q0, #2 + vext.8 q4, q4, q4, #2 + vext.8 q0, q0, q14, #14 + vext.8 q4, q4, q15, #14 + b 88f +33: // 3 pixels valid + vext.8 q0, q0, q0, #3 + vext.8 q4, q4, q4, #3 + vext.8 q0, q0, q14, #13 + vext.8 q4, q4, q15, #13 + b 88f +44: // 4 pixels valid + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + vext.8 q0, q0, q14, #12 + vext.8 q4, q4, q15, #12 + b 88f +55: // 5 pixels valid + vext.8 q0, q0, q0, #5 + vext.8 q4, q4, q4, #5 + vext.8 q0, q0, q14, #11 + vext.8 q4, q4, q15, #11 + b 88f +66: // 6 pixels valid + vext.8 q0, q0, q0, #6 + vext.8 q4, q4, q4, #6 + vext.8 q0, q0, q14, #10 + vext.8 q4, q4, q15, #10 + b 88f +77: // 7 pixels valid + vext.8 q0, q0, q0, #7 + vext.8 q4, q4, q4, #7 + vext.8 q0, q0, q14, #9 + vext.8 q4, q4, q15, #9 + +88: + // Restore r11 after using it for a temporary value above + add r11, r1, #(2*SUM_STRIDE) + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + add5 4 + subs r5, r5, #4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! + vst1.32 {q12}, [r0, :128]! + vst1.32 {q10}, [r10, :128]! 
+ ble 9f + vext.8 q0, q0, q0, #4 + vext.8 q1, q1, q2, #8 + vext.8 q4, q4, q4, #4 + vext.8 q5, q5, q6, #8 + add5 4 + vst1.16 {d6}, [r1, :64]! + vst1.16 {d14}, [r11, :64]! + vst1.32 {q12}, [r0, :128]! + vst1.32 {q10}, [r10, :128]! + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #2 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 1f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Sum all h+2 lines with the main loop + add lr, lr, #2 +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q8-q13 and q0-q2 taking top + // padding into consideration. 
+ tst r4, #4 // LR_HAVE_TOP + vld1.32 {q8, q9}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q1}, [r6, :128], r8 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q10, q8 + vmov q11, q9 + vmov q1, q0 + vmov q12, q8 + vmov q13, q9 + vmov q2, q0 + +3: + subs r3, r3, #1 +.macro add3 + vadd.i32 q8, q8, q10 + vadd.i32 q9, q9, q11 + vadd.i16 q0, q0, q1 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i16 q0, q0, q2 + vst1.32 {q8, q9}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + vmov q10, q12 + vmov q11, q13 + vmov q1, q2 + ble 4f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3b + +4: + tst r4, #8 // LR_HAVE_BOTTOM + bne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + add3 + +5: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + pop {r4-r9,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + push {r4-r9,lr} + vpush {q5-q7} + ldr r4, [sp, #76] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #8 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. 
+ sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 0f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Handle h+2 lines with the main loop + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub r3, r3, #1 // Handle h-1 lines with the main loop +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q6-q15 and q0-q3,q5 taking top + // padding into consideration. + tst r4, #4 // LR_HAVE_TOP + vld1.32 {q6, q7}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vmov q10, q6 + vmov q11, q7 + vmov q2, q0 + vmov q12, q6 + vmov q13, q7 + vmov q3, q0 + +3: + cmp r3, #0 + beq 4f + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + +3: + // Start of vertical loop + subs r3, r3, #2 +.macro add5 + vadd.i32 q6, q6, q8 + vadd.i32 q7, q7, q9 + vadd.i16 q0, q0, q1 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i16 q0, q0, q2 + vadd.i32 q6, q6, q12 + vadd.i32 q7, q7, q13 + vadd.i16 q0, q0, q3 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q15 + vadd.i16 q0, q0, q5 + vst1.32 {q6, q7}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add5 +.macro shift2 + vmov q6, q10 + vmov q7, q11 + vmov q0, q2 + vmov q8, q12 + vmov q9, q13 + vmov q1, q3 + vmov q10, q14 + vmov q11, q15 + vmov q2, q5 +.endm + shift2 + add r0, r0, r7 + add r1, r1, r8 + ble 5f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + 
b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + add5 + b 6f + +5: + tst r4, #8 // LR_HAVE_BOTTOM + bne 6f + // !LR_HAVE_BOTTOM + cmp r3, #0 + bne 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + // Pad the past-edge row from the last content row. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // r3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + vmov q12, q10 + vmov q13, q11 + vmov q3, q2 + vmov q14, q10 + vmov q15, q11 + vmov q5, q2 + add5 + add r0, r0, r7 + add r1, r1, r8 + b 6f + +6: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. 
+ // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + vpop {q5-q7} + pop {r4-r9,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +function sgr_calc_ab1_neon, export=1 + push {r4-r5,lr} + vpush {q4-q7} + ldr r4, [sp, #76] // strength (5th argument); 76 = 12 bytes pushed + 64 bytes vpushed + add r3, r3, #2 // h += 2 + vmov.i32 q15, #9 // n + movw r5, #455 // one_by_x, broadcast to q14 by sgr_calc_ab_neon + mov lr, #SUM_STRIDE // row stride in elements, used for the per-row increment + b sgr_calc_ab_neon // continue in the shared body; it restores the registers and returns +endfunc + +function sgr_calc_ab2_neon, export=1 + push {r4-r5,lr} + vpush {q4-q7} + ldr r4, [sp, #76] // strength (5th argument); 76 = 12 bytes pushed + 64 bytes vpushed + add r3, r3, #3 // h += 3 + asr r3, r3, #1 // h /= 2 + vmov.i32 q15, #25 // n + mov r5, #164 // one_by_x, broadcast to q14 by sgr_calc_ab_neon + mov lr, #(2*SUM_STRIDE) // row stride of two rows; NOTE(review): no branch follows, presumably falls through into sgr_calc_ab_neon below - confirm against the function/endfunc macros +endfunc + +function sgr_calc_ab_neon + movrel r12, X(sgr_x_by_x) + vld1.8 {q8, q9}, [r12, :128]! 
+ vmov.i8 q11, #5 + vmov.i8 d10, #55 // idx of last 5 + vld1.8 {q10}, [r12, :128] + vmov.i8 d11, #72 // idx of last 4 + vmov.i8 d12, #101 // idx of last 3 + vmov.i8 d13, #169 // idx of last 2 + vmov.i8 d14, #254 // idx of last 1 + vmov.i8 d15, #32 // elements consumed in first vtbl + add r2, r2, #2 // w += 2 + add r12, r2, #7 + bic r12, r12, #7 // aligned w + sub r12, lr, r12 // increment between rows + vmov.i16 q13, #256 + vdup.32 q12, r4 + vdup.32 q14, r5 // one_by_x + sub r0, r0, #(4*(SUM_STRIDE)) + sub r1, r1, #(2*(SUM_STRIDE)) + mov r4, r2 // backup of w + vsub.i8 q8, q8, q11 + vsub.i8 q9, q9, q11 + vsub.i8 q10, q10, q11 +1: + subs r2, r2, #8 + vld1.32 {q0, q1}, [r0, :128] // a + vld1.16 {q2}, [r1, :128] // b + vmul.i32 q0, q0, q15 // a * n + vmul.i32 q1, q1, q15 // a * n + vmull.u16 q3, d4, d4 // b * b + vmull.u16 q4, d5, d5 // b * b + vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) + vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) + vmul.i32 q0, q0, q12 // p * s + vmul.i32 q1, q1, q12 // p * s + vqshrn.u32 d0, q0, #16 + vqshrn.u32 d1, q1, #16 + vqrshrn.u16 d0, q0, #4 // imin(z, 255) + + vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 + vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 + vtbl.8 d1, {q8, q9}, d0 + vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 + vsub.i8 d9, d0, d15 // indices for vtbx + vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 + vadd.i8 d2, d2, d3 + vtbx.8 d1, {q10}, d9 + vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 + vadd.i8 d6, d6, d7 + vadd.i8 d8, d8, d22 + vadd.i8 d2, d2, d6 + vadd.i8 d1, d1, d8 + vadd.i8 d1, d1, d2 + vmovl.u8 q0, d1 // x + + vmull.u16 q1, d0, d4 // x * BB[i] + vmull.u16 q2, d1, d5 // x * BB[i] + vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x + vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x + vrshr.s32 q1, q1, #12 // AA[i] + vrshr.s32 q2, q2, #12 // AA[i] + vsub.i16 q0, q13, q0 // 256 - x + + vst1.32 {q1, q2}, [r0, :128]! + vst1.16 {q0}, [r1, :128]! 
+ bgt 1b + + subs r3, r3, #1 + ble 0f + add r0, r0, r12, lsl #2 + add r1, r1, r12, lsl #1 + mov r2, r4 + b 1b +0: + vpop {q4-q7} + pop {r4-r5,pc} +endfunc + +#define FILTER_OUT_STRIDE 384 + +// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + sub r7, r3, #(4*SUM_STRIDE) + add r8, r3, #(4*SUM_STRIDE) + sub r9, r4, #(2*SUM_STRIDE) + add r10, r4, #(2*SUM_STRIDE) + mov r11, #SUM_STRIDE + mov r12, #FILTER_OUT_STRIDE + add lr, r5, #3 + bic lr, lr, #3 // Aligned width + sub r2, r2, lr + sub r12, r12, lr + sub r11, r11, lr + sub r11, r11, #4 // We read 4 extra elements from both a and b + mov lr, r5 + vmov.i16 q14, #3 + vmov.i32 q15, #3 +1: + vld1.16 {q0}, [r9]! + vld1.16 {q1}, [r4]! + vld1.16 {q2}, [r10]! + vld1.32 {q8, q9}, [r7]! + vld1.32 {q10, q11}, [r3]! + vld1.32 {q12, q13}, [r8]! + +2: + subs r5, r5, #4 + vext.8 d6, d0, d1, #2 // -stride + vext.8 d7, d2, d3, #2 // 0 + vext.8 d8, d4, d5, #2 // +stride + vext.8 d9, d0, d1, #4 // +1-stride + vext.8 d10, d2, d3, #4 // +1 + vext.8 d11, d4, d5, #4 // +1+stride + vadd.i16 d2, d2, d6 // -1, -stride + vadd.i16 d7, d7, d8 // 0, +stride + vadd.i16 d0, d0, d9 // -1-stride, +1-stride + vadd.i16 d2, d2, d7 + vadd.i16 d4, d4, d11 // -1+stride, +1+stride + vadd.i16 d2, d2, d10 // +1 + vadd.i16 d0, d0, d4 + + vext.8 q3, q8, q9, #4 // -stride + vshl.i16 d2, d2, #2 + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q10, q11, #4 // 0 + vext.8 q6, q10, q11, #8 // +1 + vmla.i16 d2, d0, d28 // * 3 -> a + vadd.i32 q3, q3, q10 // -stride, -1 + vadd.i32 q8, q8, q4 // -1-stride, +1-stride + vadd.i32 q5, q5, q6 // 0, +1 + vadd.i32 q8, q8, q12 // -1+stride + vadd.i32 q3, q3, q5 + vext.8 q7, q12, q13, #4 // +stride + vext.8 q10, q12, q13, #8 // +1+stride + vld1.32 {d24[0]}, [r1]! 
// src + vadd.i32 q3, q3, q7 // +stride + vadd.i32 q8, q8, q10 // +1+stride + vshl.i32 q3, q3, #2 + vmla.i32 q3, q8, q15 // * 3 -> b + vmovl.u8 q12, d24 // src + vmov d0, d1 + vmlal.u16 q3, d2, d24 // b + a * src + vmov d2, d3 + vrshrn.i32 d6, q3, #9 + vmov d4, d5 + vst1.16 {d6}, [r0]! + + ble 3f + vmov q8, q9 + vmov q10, q11 + vmov q12, q13 + vld1.16 {d1}, [r9]! + vld1.16 {d3}, [r4]! + vld1.16 {d5}, [r10]! + vld1.32 {q9}, [r7]! + vld1.32 {q11}, [r3]! + vld1.32 {q13}, [r8]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r12, lsl #1 + add r1, r1, r2 + add r3, r3, r11, lsl #2 + add r7, r7, r11, lsl #2 + add r8, r8, r11, lsl #2 + add r4, r4, r11, lsl #1 + add r9, r9, r11, lsl #1 + add r10, r10, r11, lsl #1 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + add r7, r3, #(4*(SUM_STRIDE)) + sub r3, r3, #(4*(SUM_STRIDE)) + add r8, r4, #(2*(SUM_STRIDE)) + sub r4, r4, #(2*(SUM_STRIDE)) + mov r9, #(2*SUM_STRIDE) + mov r10, #FILTER_OUT_STRIDE + add r11, r5, #7 + bic r11, r11, #7 // Aligned width + sub r2, r2, r11 + sub r10, r10, r11 + sub r9, r9, r11 + sub r9, r9, #4 // We read 4 extra elements from a + sub r12, r9, #4 // We read 8 extra elements from b + mov lr, r5 + +1: + vld1.16 {q0, q1}, [r4]! + vld1.16 {q2, q3}, [r8]! + vld1.32 {q8, q9}, [r3]! + vld1.32 {q11, q12}, [r7]! + vld1.32 {q10}, [r3]! + vld1.32 {q13}, [r7]! 
+ +2: + vmov.i16 q14, #5 + vmov.i16 q15, #6 + subs r5, r5, #8 + vext.8 q4, q0, q1, #4 // +1-stride + vext.8 q5, q2, q3, #4 // +1+stride + vext.8 q6, q0, q1, #2 // -stride + vext.8 q7, q2, q3, #2 // +stride + vadd.i16 q0, q0, q4 // -1-stride, +1-stride + vadd.i16 q5, q2, q5 // -1+stride, +1+stride + vadd.i16 q2, q6, q7 // -stride, +stride + vadd.i16 q0, q0, q5 + + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q9, q10, #8 + vext.8 q6, q11, q12, #8 // +1+stride + vext.8 q7, q12, q13, #8 + vmul.i16 q0, q0, q14 // * 5 + vmla.i16 q0, q2, q15 // * 6 + vadd.i32 q4, q4, q8 // -1-stride, +1-stride + vadd.i32 q5, q5, q9 + vadd.i32 q6, q6, q11 // -1+stride, +1+stride + vadd.i32 q7, q7, q12 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q7 + vext.8 q6, q8, q9, #4 // -stride + vext.8 q7, q9, q10, #4 + vext.8 q8, q11, q12, #4 // +stride + vext.8 q11, q12, q13, #4 + + vld1.8 {d4}, [r1]! + + vmov.i32 q14, #5 + vmov.i32 q15, #6 + + vadd.i32 q6, q6, q8 // -stride, +stride + vadd.i32 q7, q7, q11 + vmul.i32 q4, q4, q14 // * 5 + vmla.i32 q4, q6, q15 // * 6 + vmul.i32 q5, q5, q14 // * 5 + vmla.i32 q5, q7, q15 // * 6 + + vmovl.u8 q2, d4 + vmlal.u16 q4, d0, d4 // b + a * src + vmlal.u16 q5, d1, d5 // b + a * src + vmov q0, q1 + vrshrn.i32 d8, q4, #9 + vrshrn.i32 d9, q5, #9 + vmov q2, q3 + vst1.16 {q4}, [r0]! + + ble 3f + vmov q8, q10 + vmov q11, q13 + vld1.16 {q1}, [r4]! + vld1.16 {q3}, [r8]! + vld1.32 {q9, q10}, [r3]! + vld1.32 {q12, q13}, [r7]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r10, lsl #1 + add r1, r1, r2 + add r3, r3, r9, lsl #2 + add r7, r7, r9, lsl #2 + add r4, r4, r12, lsl #1 + add r8, r8, r12, lsl #1 + + vld1.32 {q8, q9}, [r3]! + vld1.16 {q0, q1}, [r4]! + vld1.32 {q10}, [r3]! 
+ + vmov.i16 q12, #5 + vmov.i16 q13, #6 + +4: + subs r5, r5, #8 + vext.8 q3, q0, q1, #4 // +1 + vext.8 q2, q0, q1, #2 // 0 + vadd.i16 q0, q0, q3 // -1, +1 + + vext.8 q4, q8, q9, #4 // 0 + vext.8 q5, q9, q10, #4 + vext.8 q6, q8, q9, #8 // +1 + vext.8 q7, q9, q10, #8 + vmul.i16 q2, q2, q13 // * 6 + vmla.i16 q2, q0, q12 // * 5 -> a + vld1.8 {d22}, [r1]! + vadd.i32 q8, q8, q6 // -1, +1 + vadd.i32 q9, q9, q7 + vmovl.u8 q11, d22 + vmul.i32 q4, q4, q15 // * 6 + vmla.i32 q4, q8, q14 // * 5 -> b + vmul.i32 q5, q5, q15 // * 6 + vmla.i32 q5, q9, q14 // * 5 -> b + + vmlal.u16 q4, d4, d22 // b + a * src + vmlal.u16 q5, d5, d23 + vmov q0, q1 + vrshrn.i32 d8, q4, #8 + vrshrn.i32 d9, q5, #8 + vmov q8, q10 + vst1.16 {q4}, [r0]! + + ble 5f + vld1.16 {q1}, [r4]! + vld1.32 {q9, q10}, [r3]! + b 4b + +5: + subs r6, r6, #1 + ble 0f + mov r5, lr + sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started + sub r4, r4, r11, lsl #1 + add r0, r0, r10, lsl #1 + add r1, r1, r2 + sub r3, r3, #16 + sub r4, r4, #16 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt); +function sgr_weighted1_8bpc_neon, export=1 + push {r4-r9,lr} + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] + ldr r8, [sp, #44] + vdup.16 d31, r7 + cmp r6, #2 + add r9, r0, r1 + add r12, r2, r3 + add lr, r4, #2*FILTER_OUT_STRIDE + mov r7, #(4*FILTER_OUT_STRIDE) + lsl r1, r1, #1 + lsl r3, r3, #1 + add r8, r5, #7 + bic r8, r8, #7 // Aligned width + sub r1, r1, r8 + sub r3, r3, r8 + sub r7, r7, r8, lsl #1 + mov r8, r5 + blt 2f +1: + vld1.8 {d0}, [r2]! + vld1.8 {d16}, [r12]! + vld1.16 {q1}, [r4]! + vld1.16 {q9}, [lr]! 
+ subs r5, r5, #8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q9, q9, q8 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vshll.u16 q10, d16, #7 // u << 7 + vshll.u16 q11, d17, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vmlal.s16 q10, d18, d31 // v + vmlal.s16 q11, d19, d31 // v + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vrshrn.i32 d20, q10, #11 + vrshrn.i32 d21, q11, #11 + vqmovun.s16 d4, q2 + vqmovun.s16 d20, q10 + vst1.8 {d4}, [r0]! + vst1.8 {d20}, [r9]! + bgt 1b + + sub r6, r6, #2 + cmp r6, #1 + blt 0f + mov r5, r8 + add r0, r0, r1 + add r9, r9, r1 + add r2, r2, r3 + add r12, r12, r3 + add r4, r4, r7 + add lr, lr, r7 + beq 2f + b 1b + +2: + vld1.8 {d0}, [r2]! + vld1.16 {q1}, [r4]! + subs r5, r5, #8 + vshll.u8 q0, d0, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vqmovun.s16 d2, q2 + vst1.8 {d2}, [r0]! + bgt 2b +0: + pop {r4-r9,pc} +endfunc + +// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2]); +function sgr_weighted2_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldr r8, [sp, #52] + cmp r7, #2 + add r10, r0, r1 + add r11, r2, r3 + add r12, r4, #2*FILTER_OUT_STRIDE + add lr, r5, #2*FILTER_OUT_STRIDE + vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] + mov r8, #4*FILTER_OUT_STRIDE + lsl r1, r1, #1 + lsl r3, r3, #1 + add r9, r6, #7 + bic r9, r9, #7 // Aligned width + sub r1, r1, r9 + sub r3, r3, r9 + sub r8, r8, r9, lsl #1 + mov r9, r6 + blt 2f +1: + vld1.8 {d0}, [r2]! + vld1.8 {d16}, [r11]! + vld1.16 {q1}, [r4]! + vld1.16 {q9}, [r12]! + vld1.16 {q2}, [r5]! 
+ vld1.16 {q10}, [lr]! + subs r6, r6, #8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vsub.i16 q9, q9, q8 // t1 - u + vsub.i16 q10, q10, q8 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vshll.u16 q11, d16, #7 // u << 7 + vshll.u16 q8, d17, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) + vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) + vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) + vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vrshrn.i32 d22, q11, #11 + vrshrn.i32 d23, q8, #11 + vqmovun.s16 d6, q3 + vqmovun.s16 d22, q11 + vst1.8 {d6}, [r0]! + vst1.8 {d22}, [r10]! + bgt 1b + + subs r7, r7, #2 + cmp r7, #1 + blt 0f + mov r6, r9 + add r0, r0, r1 + add r10, r10, r1 + add r2, r2, r3 + add r11, r11, r3 + add r4, r4, r8 + add r12, r12, r8 + add r5, r5, r8 + add lr, lr, r8 + beq 2f + b 1b + +2: + vld1.8 {d0}, [r2]! + vld1.16 {q1}, [r4]! + vld1.16 {q2}, [r5]! + subs r6, r6, #8 + vshll.u8 q0, d0, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vqmovun.s16 d6, q3 + vst1.8 {d6}, [r0]! 
+ bgt 2b // Stay in the single-row loop; 1b is the two-row loop and would also store a row through r10 +0: + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S new file mode 100644 index 0000000000..39c248f8b5 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration16.S @@ -0,0 +1,720 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + ldr r8, [sp, #116] // bitdepth_max + vld1.16 {q0}, [r4] + clz r8, r8 + vmov.i32 q14, #1 + sub r9, r8, #38 // -(bitdepth + 6) + sub r8, r8, #25 // -round_bits_h + neg r9, r9 // bitdepth + 6 + vdup.32 q1, r9 + vdup.32 q13, r8 // -round_bits_h + vmov.i16 q15, #8192 + vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6) + mov r8, r5 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Clear the last unused element of q0, to allow filtering a single + // pixel with one plain vmul+vpadd. + mov r12, #0 + vmov.16 d1[3], r12 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the width from mid_stride + sub r10, r10, r5, lsl #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. + cmp r5, #8 + add r11, r5, #13 + bic r11, r11, #7 + bge 1f + mov r11, #16 +1: + sub r3, r3, r11, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #6 + sub lr, lr, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #6 + + +1: // Loop vertically + vld1.16 {q2, q3}, [r2]! + vld1.16 {q4, q5}, [lr]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d3}, [r1]! 
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r2, r2, #6 + sub lr, lr, #6 + vld1.16 {d13}, [r1]! + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost pixel + // and shift q2/q3 to have 3x the first pixel at the front. + vdup.16 q1, d4[0] + vdup.16 q6, d8[0] + // Move r2 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub r2, r2, #6 + sub lr, lr, #6 + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + lsl r9, r9, #1 + ldrh r11, [r2, r9] + ldrh r9, [lr, r9] + // Fill q11/q12 with the right padding pixel + vdup.16 q11, r11 + vdup.16 q12, r9 +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + cmp r5, #7 + bge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally + vext.8 q10, q2, q3, #6 + vext.8 q8, q2, q3, #2 + vext.8 q9, q2, q3, #4 + vshll.u16 q6, d20, #7 + vshll.u16 q7, d21, #7 + vmlal.s16 q6, d4, d0[0] + vmlal.s16 q6, d16, d0[1] + vmlal.s16 q6, d18, d0[2] + vmlal.s16 q6, d20, d0[3] + vmlal.s16 q7, d5, d0[0] + vmlal.s16 q7, d17, d0[1] + vmlal.s16 q7, d19, d0[2] + vmlal.s16 q7, d21, d0[3] + vext.8 q8, q2, q3, #8 + vext.8 q9, q2, q3, #10 + vext.8 q10, q2, q3, #12 + vmlal.s16 q6, d16, d1[0] + vmlal.s16 q6, d18, d1[1] + vmlal.s16 q6, d20, d1[2] + vmlal.s16 q7, d17, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d21, d1[2] + vext.8 q10, q4, q5, #6 + vext.8 q2, q4, q5, #2 + vshll.u16 q8, d20, #7 + vshll.u16 q9, d21, #7 + vmlal.s16 q8, d8, d0[0] + vmlal.s16 q8, d4, d0[1] + vmlal.s16 q8, d20, d0[3] + vmlal.s16 q9, d9, d0[0] + vmlal.s16 q9, d5, d0[1] + vmlal.s16 q9, d21, d0[3] + vext.8 q2, q4, q5, #4 + vext.8 q10, q4, q5, #8 + vmlal.s16 q8, d4, d0[2] + vmlal.s16 q8, d20, d1[0] + vmlal.s16 q9, d5, d0[2] + vmlal.s16 q9, d21, d1[0] + vext.8 q2, q4, q5, #10 + vext.8 q10, q4, q5, #12 + vmlal.s16 q8, d4, d1[1] + vmlal.s16 q8, d20, d1[2] + vmlal.s16 q9, d5, d1[1] + vmlal.s16 q9, d21, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vadd.i32 q8, q8, q14 + vadd.i32 q9, q9, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vrshl.s32 q8, q8, q13 + vrshl.s32 q9, q9, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vqmovun.s32 d14, q8 + vqmovun.s32 d15, q9 + vmin.u16 q6, q6, q10 + vmin.u16 q7, q7, q10 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vst1.16 {q6}, [r0, :128]! + vst1.16 {q7}, [r12, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q2, q3 + vmov q4, q5 + vld1.16 {q3}, [r2]! + vld1.16 {q5}, [lr]! + bne 4b // If we don't need to pad, just keep filtering. 
+ b 3b // If we need to pad, check how many pixels we have left. + +5: // Filter 4 pixels, 7 <= w < 11 +.macro filter_4 + vext.8 d18, d4, d5, #6 + vext.8 d16, d4, d5, #2 + vext.8 d17, d4, d5, #4 + vext.8 d19, d5, d6, #2 + vext.8 d20, d5, d6, #4 + vshll.u16 q6, d18, #7 + vmlal.s16 q6, d4, d0[0] + vmlal.s16 q6, d16, d0[1] + vmlal.s16 q6, d17, d0[2] + vmlal.s16 q6, d18, d0[3] + vmlal.s16 q6, d5, d1[0] + vmlal.s16 q6, d19, d1[1] + vmlal.s16 q6, d20, d1[2] + + vext.8 d18, d8, d9, #6 + vext.8 d16, d8, d9, #2 + vext.8 d17, d8, d9, #4 + vext.8 d19, d9, d10, #2 + vext.8 d20, d9, d10, #4 + vshll.u16 q7, d18, #7 + vmlal.s16 q7, d8, d0[0] + vmlal.s16 q7, d16, d0[1] + vmlal.s16 q7, d17, d0[2] + vmlal.s16 q7, d18, d0[3] + vmlal.s16 q7, d9, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d20, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vmin.u16 q6, q6, q10 + vsub.i16 q6, q6, q15 +.endm + filter_4 + vst1.16 {d12}, [r0, :64]! + vst1.16 {d13}, [r12, :64]! + + subs r5, r5, #4 // 3 <= w < 7 + vext.8 q2, q2, q3, #8 + vext.8 q3, q3, q3, #8 + vext.8 q4, q4, q5, #8 + vext.8 q5, q5, q5, #8 + +6: // Pad the right edge and filter the last few pixels. + // w < 7, w+3 pixels valid in q2-q3 + cmp r5, #5 + blt 7f + bgt 8f + // w == 5, 8 pixels valid in q2, q3 invalid + vmov q3, q11 + vmov q5, q12 + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in q2 + sub r9, r5, #1 + // r9 = (pixels valid - 4) + adr r11, L(variable_shift_tbl) + ldr r9, [r11, r9, lsl #2] + add r11, r11, r9 + vmov q3, q11 + vmov q5, q12 + bx r11 + + .align 2 +L(variable_shift_tbl): + .word 44f - L(variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(variable_shift_tbl) + CONFIG_THUMB + .word 66f - L(variable_shift_tbl) + CONFIG_THUMB + .word 77f - L(variable_shift_tbl) + CONFIG_THUMB + +44: // 4 pixels valid in q2/q4, fill the high half with padding. 
+ vmov d5, d6 + vmov d9, d10 + b 88f + // Shift q2 right, shifting out invalid pixels, + // shift q2 left to the original offset, shifting in padding pixels. +55: // 5 pixels valid + vext.8 q2, q2, q2, #10 + vext.8 q2, q2, q3, #6 + vext.8 q4, q4, q4, #10 + vext.8 q4, q4, q5, #6 + b 88f +66: // 6 pixels valid + vext.8 q2, q2, q2, #12 + vext.8 q2, q2, q3, #4 + vext.8 q4, q4, q4, #12 + vext.8 q4, q4, q5, #4 + b 88f +77: // 7 pixels valid + vext.8 q2, q2, q2, #14 + vext.8 q2, q2, q3, #2 + vext.8 q4, q4, q4, #14 + vext.8 q4, q4, q5, #2 + b 88f + +8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3 + vext.8 q3, q3, q3, #2 + vext.8 q3, q3, q11, #14 + vext.8 q5, q5, q5, #2 + vext.8 q5, q5, q12, #14 + +88: + // w < 7, q2-q3 padded properly + cmp r5, #4 + blt 888f + + // w >= 4, filter 4 pixels + filter_4 + vst1.16 {d12}, [r0, :64]! + vst1.16 {d13}, [r12, :64]! + subs r5, r5, #4 // 0 <= w < 4 + vext.8 q2, q2, q3, #8 + vext.8 q4, q4, q5, #8 + beq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + vmull.s16 q6, d4, d0 + vmull.s16 q7, d5, d1 + vmull.s16 q8, d8, d0 + vmull.s16 q9, d9, d1 + vadd.i32 q6, q7 + vadd.i32 q8, q9 + vpadd.i32 d12, d12, d13 + vpadd.i32 d13, d16, d17 + vdup.16 d14, d4[3] + vdup.16 d15, d8[3] + vpadd.i32 d12, d12, d13 + vtrn.16 d14, d15 + vadd.i32 d12, d12, d28 + vshll.u16 q7, d14, #7 + vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 d12, d12, d14 + vrshl.s32 d12, d12, d26 + vqmovun.s32 d12, q6 + vmin.u16 d12, d12, d20 + vsub.i16 d12, d12, d30 + vst1.16 {d12[0]}, [r0, :16]! + vst1.16 {d12[1]}, [r12, :16]! 
+ subs r5, r5, #1 + vext.8 q2, q2, q3, #2 + vext.8 q4, q4, q5, #2 + bgt 888b + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem filter_4 +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q4-q5} + ldrd r4, r5, [sp, #52] + ldrd r6, r7, [sp, #60] + ldr lr, [sp, #68] // bitdepth_max + vmov.i16 q1, #0 + mov r12, #128 + vld1.16 {q0}, [r5] + vdup.16 q5, lr + clz lr, lr + vmov.i16 d2[3], r12 + sub lr, lr, #11 // round_bits_v + vadd.i16 q0, q0, q1 + vdup.32 q4, lr + mov lr, r4 + vneg.s32 q4, q4 // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. + tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vrshl.s32 q2, q2, q4 // round_bits_v + vrshl.s32 q3, q3, q4 + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q5 // bitdepth_max + vst1.16 {q2}, [r0], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, load edge into q12-q14. + vmov q13, q12 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. + vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. + vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. 
+ vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-q15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #16 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + vpop {q4-q5} + pop {r4-r7,pc} +.purgem filter +endfunc + +// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_16bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + adr r12, L(copy_narrow_tbl) + ldr r3, [r12, r3, lsl #2] + add r12, r12, r3 + bx r12 + + .align 2 +L(copy_narrow_tbl): + .word 0 + .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB + +10: + add r3, r0, r1 + lsl r1, r1, #1 +18: + subs r4, r4, #8 + blt 110f + vld1.16 {q0}, [r2, :128]! 
+ vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r3, :16], r1 + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d0[3]}, [r3, :16], r1 + vst1.16 {d1[0]}, [r0, :16], r1 + vst1.16 {d1[1]}, [r3, :16], r1 + vst1.16 {d1[2]}, [r0, :16], r1 + vst1.16 {d1[3]}, [r3, :16], r1 + ble 0f + b 18b +110: + add r4, r4, #8 + asr r1, r1, #1 +11: + subs r4, r4, #1 + vld1.16 {d0[]}, [r2]! + vst1.16 {d0[0]}, [r0], r1 + bgt 11b +0: + pop {r4,pc} + +20: + add r3, r0, r1 + lsl r1, r1, #1 +24: + subs r4, r4, #4 + blt 210f + vld1.32 {q0}, [r2, :128]! + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r3, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r3, :32], r1 + ble 0f + b 24b +210: + add r4, r4, #4 + asr r1, r1, #1 +22: + subs r4, r4, #1 + vld1.32 {d0[]}, [r2, :32]! + vst1.32 {d0[0]}, [r0, :32], r1 + bgt 22b +0: + pop {r4,pc} + +30: + ldr r3, [r2] + ldrh r12, [r2, #4] + add r2, r2, #6 + subs r4, r4, #1 + str r3, [r0] + strh r12, [r0, #4] + add r0, r0, r1 + bgt 30b + pop {r4,pc} + +40: + add r3, r0, r1 + lsl r1, r1, #1 +42: + subs r4, r4, #2 + blt 41f + vld1.16 {q0}, [r2, :128]! 
+ vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r3, :64], r1 + ble 0f + b 42b +41: + vld1.16 {d0}, [r2, :64] + vst1.16 {d0}, [r0, :64] +0: + pop {r4,pc} + +50: + vld1.16 {d0}, [r2] + ldrh r12, [r2, #8] + add r2, r2, #10 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + strh r12, [r0, #8] + add r0, r0, r1 + bgt 50b + pop {r4,pc} + +60: + vld1.16 {d0}, [r2] + ldr r12, [r2, #8] + add r2, r2, #12 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + str r12, [r0, #8] + add r0, r0, r1 + bgt 60b + pop {r4,pc} + +70: + vld1.16 {d0}, [r2] + ldr r12, [r2, #8] + ldrh lr, [r2, #12] + add r2, r2, #14 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + str r12, [r0, #8] + strh lr, [r0, #12] + add r0, r0, r1 + bgt 70b + pop {r4,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S new file mode 100644 index 0000000000..1a12d93ad9 --- /dev/null +++ b/third_party/dav1d/src/arm/32/mc.S @@ -0,0 +1,3349 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro avg dst0, dst1, t0, t1, t2, t3 + vld1.16 {\t0,\t1}, [r2, :128]! + vld1.16 {\t2,\t3}, [r3, :128]! + vadd.i16 \t0, \t0, \t2 + vadd.i16 \t1, \t1, \t3 + vqrshrun.s16 \dst0, \t0, #5 + vqrshrun.s16 \dst1, \t1, #5 +.endm + +.macro w_avg dst0, dst1, t0, t1, t2, t3 + vld1.16 {\t0,\t1}, [r2, :128]! + vld1.16 {\t2,\t3}, [r3, :128]! + vsub.i16 \t0, \t2, \t0 + vsub.i16 \t1, \t3, \t1 + vqdmulh.s16 \t0, \t0, q15 + vqdmulh.s16 \t1, \t1, q15 + vadd.i16 \t0, \t2, \t0 + vadd.i16 \t1, \t3, \t1 + vqrshrun.s16 \dst0, \t0, #4 + vqrshrun.s16 \dst1, \t1, #4 +.endm + +.macro mask dst0, dst1, t0, t1, t2, t3 + vld1.8 {q14}, [lr, :128]! + vld1.16 {\t0,\t1}, [r2, :128]! + vmul.i8 q14, q14, q15 + vld1.16 {\t2,\t3}, [r3, :128]! 
+ vshll.i8 q13, d28, #8 + vshll.i8 q14, d29, #8 + vsub.i16 \t0, \t2, \t0 + vsub.i16 \t1, \t3, \t1 + vqdmulh.s16 \t0, \t0, q13 + vqdmulh.s16 \t1, \t1, q14 + vadd.i16 \t0, \t2, \t0 + vadd.i16 \t1, \t3, \t1 + vqrshrun.s16 \dst0, \t0, #4 + vqrshrun.s16 \dst1, \t1, #4 +.endm + +.macro bidir_fn type +function \type\()_8bpc_neon, export=1 + push {r4-r6,lr} + ldr r4, [sp, #16] + ldr r5, [sp, #20] + clz r4, r4 +.ifnc \type, avg + ldr lr, [sp, #24] +.endif +.ifc \type, w_avg + vdup.s16 q15, lr + vneg.s16 q15, q15 + vshl.i16 q15, q15, #11 +.endif +.ifc \type, mask + vmov.i8 q15, #256-2 +.endif + adr r12, L(\type\()_tbl) + sub r4, r4, #24 + ldr r4, [r12, r4, lsl #2] + \type d16, d17, q0, q1, q2, q3 + add r12, r12, r4 + bx r12 + + .align 2 +L(\type\()_tbl): + .word 1280f - L(\type\()_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_tbl) + CONFIG_THUMB + .word 4f - L(\type\()_tbl) + CONFIG_THUMB + +4: + add r6, r0, r1 + lsl r1, r1, #1 + cmp r5, #4 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + beq 0f + \type d18, d19, q0, q1, q2, q3 + cmp r5, #8 + vst1.32 {d18[0]}, [r0, :32], r1 + vst1.32 {d18[1]}, [r6, :32], r1 + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d19[1]}, [r6, :32], r1 + beq 0f + \type d16, d17, q0, q1, q2, q3 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + \type d18, d19, q0, q1, q2, q3 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + vst1.32 {d18[0]}, [r0, :32], r1 + vst1.32 {d18[1]}, [r6, :32], r1 + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d19[1]}, [r6, :32], r1 + pop {r4-r6,pc} +80: + add r6, r0, r1 + lsl r1, r1, #1 +8: + vst1.8 {d16}, [r0, :64], r1 + \type d18, d19, q0, q1, q2, q3 + vst1.8 {d17}, [r6, :64], r1 + vst1.8 {d18}, [r0, :64], r1 + subs r5, r5, #4 + vst1.8 {d19}, [r6, :64], r1 + ble 0f + 
\type d16, d17, q0, q1, q2, q3 + b 8b +160: + add r6, r0, r1 + lsl r1, r1, #1 +16: + \type d18, d19, q0, q1, q2, q3 + vst1.8 {q8}, [r0, :128], r1 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q9}, [r6, :128], r1 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q10}, [r0, :128], r1 + subs r5, r5, #4 + vst1.8 {q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 16b +320: + add r6, r0, r1 + lsl r1, r1, #1 +32: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #2 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 32b +640: + add r6, r0, #32 +64: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d16, d17, q0, q1, q2, q3 + vst1.8 {q10, q11}, [r6, :128], r1 + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #2 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 64b +1280: + sub r1, r1, #32 + add r6, r0, #64 +128: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128]! + \type d16, d17, q0, q1, q2, q3 + vst1.8 {q10, q11}, [r0, :128], r1 + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r6, :128]! 
+ \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #1 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 128b + +0: + pop {r4-r6,pc} +endfunc +.endm + +bidir_fn avg +bidir_fn w_avg +bidir_fn mask + + +.macro w_mask_fn type +function w_mask_\type\()_8bpc_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + ldr r5, [sp, #32] + ldr r6, [sp, #36] + ldr r7, [sp, #40] + clz r8, r4 + adr r9, L(w_mask_\type\()_tbl) + sub r8, r8, #24 + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + movw r12, #6903 + vdup.16 q14, r12 +.if \type == 444 + vmov.i8 q15, #64 +.elseif \type == 422 + vdup.8 d0, r7 // d0[] <- sign + vmov.i8 d30, #129 + vsub.i8 d30, d30, d0 // 129 - sign +.elseif \type == 420 + vdup.16 q0, r7 // d0[] <- sign + vmov.i16 q15, #256 + vsub.i16 q15, q15, q0 // 256 - sign +.endif + add r12, r0, r1 + lsl r1, r1, #1 + bx r9 + + .align 2 +L(w_mask_\type\()_tbl): + .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + +4: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once) + vld1.16 {d4, d5, d6, d7}, [r3, :128]! 
// tmp2 (four rows at once) + subs r5, r5, #4 + vsub.i16 q8, q2, q0 // tmp2-tmp1 + vsub.i16 q9, q3, q1 + vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x])) + vabd.s16 q11, q1, q3 + vqsub.u16 q10, q14, q10 // 6903 - abs () + vqsub.u16 q11, q14, q11 + vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8 + vshr.s16 q11, q11, #8 + vshl.s16 q12, q10, #9 // (64-m)<<9 + vshl.s16 q13, q11, #9 + vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15 + vqdmulh.s16 q13, q13, q9 + vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1 + vadd.i16 q13, q13, q1 + vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - m + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // m + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition) + vpadd.s16 d21, d22, d23 + vmovn.s16 d6, q10 + vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1 + vst1.8 {d6}, [r6, :64]! +.elseif \type == 420 + vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition) + vadd.s16 d21, d22, d23 + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d20[0]}, [r6, :32]! +.endif + vst1.32 {d24[0]}, [r0, :32], r1 + vst1.32 {d24[1]}, [r12, :32], r1 + vst1.32 {d25[0]}, [r0, :32], r1 + vst1.32 {d25[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r9,pc} +8: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2 + vld1.16 {d4, d5, d6, d7}, [r3, :128]! 
// tmp2y1, tmp2y2 + subs r5, r5, #2 + vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1 + vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2 + vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1) + vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2) + vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) + vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2) + vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 + vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8 + vshl.s16 q12, q10, #9 // (64 - my1) << 9 + vshl.s16 q13, q11, #9 // (64 - my2) << 9 + vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 + vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 + vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 + vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 + vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - m + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // m + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) + vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2) + vmovn.s16 d20, q10 + vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1 + vst1.8 {d20}, [r6, :64]! +.elseif \type == 420 + vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d20[0]}, [r6, :32]! 
+.endif + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d25}, [r12, :64], r1 + bgt 8b + pop {r4-r9,pc} +1280: +640: +320: +160: + sub r1, r1, r4 +.if \type == 444 + add lr, r6, r4 +.elseif \type == 422 + add lr, r6, r4, lsr #1 +.endif + add r9, r3, r4, lsl #1 + add r7, r2, r4, lsl #1 +161: + mov r8, r4 +16: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1 + vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1 + vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2 + subs r8, r8, #16 + vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1 + vsub.i16 q3, q3, q1 + vabs.s16 q10, q2 // abs(tmp2y1 - tmp1y1) + vabs.s16 q11, q3 + vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) + vqsub.u16 q11, q14, q11 + vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 + vshr.s16 q11, q11, #8 + vshl.s16 q12, q10, #9 // (64 - my1) << 9 + vshl.s16 q13, q11, #9 + vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 + vqdmulh.s16 q13, q13, q3 + vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 + vadd.i16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2y2 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - my1 + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // my1 + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) + vpadd.s16 d21, d22, d23 + vmovn.s16 d20, q10 + vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1 + vst1.8 {d20}, [r6, :64]! +.endif + vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 + vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2 + vsub.i16 q1, q1, q9 + vst1.16 {d24, d25}, [r0, :128]!
// store dsty1 + vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2) + vabs.s16 q3, q1 + vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2) + vqsub.u16 q3, q14, q3 + vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8 + vshr.s16 q3, q3, #8 + vshl.s16 q12, q2, #9 // (64 - my2) << 9 + vshl.s16 q13, q3, #9 +.if \type == 444 + vmovn.u16 d4, q2 // 64 - my2 + vmovn.u16 d5, q3 + vsub.i8 q2, q15, q2 // my2 + vst1.8 {d4, d5}, [lr, :128]! +.elseif \type == 422 + vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition) + vpadd.s16 d5, d6, d7 + vmovn.s16 d4, q2 + vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1 + vst1.8 {d4}, [lr, :64]! +.elseif \type == 420 + vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition) + vadd.s16 q11, q11, q3 + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vpadd.s16 d21, d22, d23 + vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.8 {d20}, [r6, :64]! +.endif + vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 + vqdmulh.s16 q13, q13, q1 + vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 + vadd.i16 q13, q13, q9 + vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 + vst1.16 {d24, d25}, [r12, :128]! 
// store dsty2 + bgt 16b + subs r5, r5, #2 + add r2, r2, r4, lsl #1 + add r3, r3, r4, lsl #1 + add r7, r7, r4, lsl #1 + add r9, r9, r4, lsl #1 +.if \type == 444 + add r6, r6, r4 + add lr, lr, r4 +.elseif \type == 422 + add r6, r6, r4, lsr #1 + add lr, lr, r4, lsr #1 +.endif + add r0, r0, r1 + add r12, r12, r1 + bgt 161b + pop {r4-r9,pc} +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_8bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + clz lr, r3 + adr r3, L(blend_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_tbl): + .word 320f - L(blend_tbl) + CONFIG_THUMB + .word 160f - L(blend_tbl) + CONFIG_THUMB + .word 80f - L(blend_tbl) + CONFIG_THUMB + .word 40f - L(blend_tbl) + CONFIG_THUMB + +40: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld1.u8 {d2}, [r5, :64]! + vld1.u8 {d1}, [r2, :64]! + vld1.32 {d0[]}, [r0, :32] + subs r4, r4, #2 + vld1.32 {d0[1]}, [r12, :32] + vsub.i8 d3, d22, d2 + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d3 + vrshrn.i16 d20, q8, #6 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r5,pc} +80: + vmov.i8 d16, #64 + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld1.u8 {q1}, [r5, :128]! + vld1.u8 {q2}, [r2, :128]! + vld1.u8 {d0}, [r0, :64] + vsub.i8 d17, d16, d2 + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vsub.i8 d18, d16, d3 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d17 + vmull.u8 q10, d3, d5 + vmlal.u8 q10, d1, d18 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.u8 {d22}, [r0, :64], r1 + vst1.u8 {d23}, [r12, :64], r1 + bgt 8b + pop {r4-r5,pc} +160: + vmov.i8 q12, #64 + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld1.u8 {q1, q2}, [r5, :128]! + vld1.u8 {q8, q9}, [r2, :128]! 
+ vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vsub.i8 q15, q12, q1 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d16, d2 + vmlal.u8 q3, d0, d30 + vmull.u8 q14, d17, d3 + vmlal.u8 q14, d1, d31 + vsub.i8 q15, q12, q2 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q14, #6 + vmull.u8 q3, d18, d4 + vmlal.u8 q3, d26, d30 + vmull.u8 q14, d19, d5 + vmlal.u8 q14, d27, d31 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q14, #6 + vst1.u8 {q10}, [r0, :128], r1 + vst1.u8 {q11}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: + vmov.i8 q10, #64 +32: + vld1.u8 {q2, q3}, [r5, :128]! + vld1.u8 {q8, q9}, [r2, :128]! + vld1.u8 {q0, q1}, [r0, :128] + subs r4, r4, #1 + vsub.i8 q11, q10, q2 + vmull.u8 q15, d16, d4 + vmlal.u8 q15, d0, d22 + vmull.u8 q14, d17, d5 + vmlal.u8 q14, d1, d23 + vsub.i8 q11, q10, q3 + vrshrn.i16 d24, q15, #6 + vrshrn.i16 d25, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d22 + vmull.u8 q14, d19, d7 + vmlal.u8 q14, d3, d23 + vrshrn.i16 d26, q15, #6 + vrshrn.i16 d27, q14, #6 + vst1.u8 {q12, q13}, [r0, :128], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + +function blend_h_8bpc_neon, export=1 + push {r4-r8,lr} + ldr r4, [sp, #24] + movrel r5, X(obmc_masks) + add r5, r5, r4 + sub r4, r4, r4, lsr #2 + clz r6, r3 + adr r7, L(blend_h_tbl) + sub r6, r6, #24 + ldr r6, [r7, r6, lsl #2] + add r7, r7, r6 + bx r7 + + .align 2 +L(blend_h_tbl): + .word 1280f - L(blend_h_tbl) + CONFIG_THUMB + .word 640f - L(blend_h_tbl) + CONFIG_THUMB + .word 320f - L(blend_h_tbl) + CONFIG_THUMB + .word 160f - L(blend_h_tbl) + CONFIG_THUMB + .word 80f - L(blend_h_tbl) + CONFIG_THUMB + .word 40f - L(blend_h_tbl) + CONFIG_THUMB + .word 20f - L(blend_h_tbl) + CONFIG_THUMB + +20: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +2: + vld1.16 {d2[], d3[]}, [r5, :16]! + vld1.32 {d1[0]}, [r2, :32]! 
+ subs r4, r4, #2 + vld1.16 {d0[]}, [r0, :16] + vzip.8 d2, d3 + vsub.i8 d4, d22, d2 + vld1.16 {d0[1]}, [r12, :16] + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d4 + vrshrn.i16 d20, q8, #6 + vst1.16 {d20[0]}, [r0, :16], r1 + vst1.16 {d20[1]}, [r12, :16], r1 + bgt 2b + pop {r4-r8,pc} +40: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld2.u8 {d2[], d3[]}, [r5, :16]! + vld1.u8 {d1}, [r2, :64]! + subs r4, r4, #2 + vext.u8 d2, d2, d3, #4 + vld1.32 {d0[]}, [r0, :32] + vsub.i8 d6, d22, d2 + vld1.32 {d0[1]}, [r12, :32] + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d6 + vrshrn.i16 d20, q8, #6 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r8,pc} +80: + vmov.i8 q8, #64 + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld2.u8 {d2[], d3[]}, [r5, :16]! + vld1.u8 {d4, d5}, [r2, :128]! + vld1.u8 {d0}, [r0, :64] + vsub.i8 q9, q8, q1 + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d18 + vmull.u8 q10, d3, d5 + vmlal.u8 q10, d1, d19 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.u8 {d22}, [r0, :64], r1 + vst1.u8 {d23}, [r12, :64], r1 + bgt 8b + pop {r4-r8,pc} +160: + vmov.i8 q12, #64 + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld2.u8 {d28[], d29[]}, [r5, :16]! + vld1.u8 {d2, d3, d4, d5}, [r2, :128]! + vsub.i8 q15, q12, q14 + vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d0, d30 + vmull.u8 q8, d3, d28 + vmlal.u8 q8, d1, d30 + vrshrn.i16 d18, q3, #6 + vrshrn.i16 d19, q8, #6 + vmull.u8 q3, d4, d29 + vmlal.u8 q3, d26, d31 + vmull.u8 q8, d5, d29 + vmlal.u8 q8, d27, d31 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q8, #6 + vst1.u8 {q9}, [r0, :128], r1 + vst1.u8 {q10}, [r12, :128], r1 + bgt 16b + pop {r4-r8,pc} +320: +640: +1280: + vmov.i8 d20, #64 + sub r1, r1, r3 +321: + vld1.u8 {d6[]}, [r5]! + vsub.i8 d7, d20, d6 + mov r8, r3 +32: + vld1.u8 {q8, q9}, [r2, :128]! 
+ vld1.u8 {q0, q1}, [r0, :128] + vmull.u8 q15, d16, d6 + vmlal.u8 q15, d0, d7 + vmull.u8 q14, d17, d6 + vmlal.u8 q14, d1, d7 + vrshrn.i16 d0, q15, #6 + vrshrn.i16 d1, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d7 + vmull.u8 q14, d19, d6 + vmlal.u8 q14, d3, d7 + vrshrn.i16 d2, q15, #6 + vrshrn.i16 d3, q14, #6 + vst1.u8 {q0, q1}, [r0, :128]! + subs r8, r8, #32 + bgt 32b + add r0, r0, r1 + subs r4, r4, #1 + bgt 321b + pop {r4-r8,pc} +endfunc + +function blend_v_8bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + movrel r5, X(obmc_masks) + add r5, r5, r3 + clz lr, r3 + adr r3, L(blend_v_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_v_tbl): + .word 320f - L(blend_v_tbl) + CONFIG_THUMB + .word 160f - L(blend_v_tbl) + CONFIG_THUMB + .word 80f - L(blend_v_tbl) + CONFIG_THUMB + .word 40f - L(blend_v_tbl) + CONFIG_THUMB + .word 20f - L(blend_v_tbl) + CONFIG_THUMB + +20: + vmov.i8 d22, #64 + vld1.8 {d2[]}, [r5] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d3, d22, d2 +2: + vld1.16 {d1[0]}, [r2, :16]! + vld1.8 {d0[]}, [r0] + subs r4, r4, #2 + vld1.8 {d1[1]}, [r2] + vld1.8 {d0[1]}, [r12] + vmull.u8 q2, d1, d2 + vmlal.u8 q2, d0, d3 + vrshrn.i16 d6, q2, #6 + add r2, r2, #2 + vst1.8 {d6[0]}, [r0], r1 + vst1.8 {d6[1]}, [r12], r1 + bgt 2b + pop {r4-r5,pc} +40: + vmov.i8 d22, #64 + vld1.32 {d4[]}, [r5, :32] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d5, d22, d4 + sub r1, r1, #2 +4: + vld1.u8 {d2}, [r2, :64]! + vld1.32 {d0[]}, [r0, :32] + vld1.32 {d0[1]}, [r12, :32] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d5 + vrshrn.i16 d20, q3, #6 + vst1.16 {d20[0]}, [r0, :16]! + vst1.16 {d20[2]}, [r12, :16]! + vst1.8 {d20[2]}, [r0], r1 + vst1.8 {d20[6]}, [r12], r1 + bgt 4b + pop {r4-r5,pc} +80: + vmov.i8 d16, #64 + vld1.u8 {d2}, [r5, :64] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d17, d16, d2 + sub r1, r1, #4 +8: + vld1.u8 {d4, d5}, [r2, :128]! 
+ vld1.u8 {d0}, [r0, :64] + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d17 + vmull.u8 q10, d2, d5 + vmlal.u8 q10, d1, d17 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.32 {d22[0]}, [r0, :32]! + vst1.32 {d23[0]}, [r12, :32]! + vst1.16 {d22[2]}, [r0, :16], r1 + vst1.16 {d23[2]}, [r12, :16], r1 + bgt 8b + pop {r4-r5,pc} +160: + vmov.i8 q12, #64 + vld1.u8 {q14}, [r5, :128] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 q11, q12, q14 + sub r1, r1, #8 +16: + vld1.u8 {q1, q2}, [r2, :128]! + vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d0, d22 + vmull.u8 q8, d3, d29 + vmlal.u8 q8, d1, d23 + vrshrn.i16 d18, q3, #6 + vrshrn.i16 d19, q8, #6 + vmull.u8 q3, d4, d28 + vmlal.u8 q3, d26, d22 + vmull.u8 q8, d5, d29 + vmlal.u8 q8, d27, d23 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q8, #6 + vst1.u8 {d18}, [r0, :64]! + vst1.u8 {d20}, [r12, :64]! + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d21[0]}, [r12, :32], r1 + bgt 16b + pop {r4-r5,pc} +320: + vmov.i8 q10, #64 + vld1.u8 {q2, q3}, [r5, :128] + vsub.i8 q11, q10, q2 + vsub.i8 d24, d20, d6 +32: + vld1.u8 {q8, q9}, [r2, :128]! + vld1.u8 {d0, d1, d2}, [r0, :64] + subs r4, r4, #1 + vmull.u8 q15, d16, d4 + vmlal.u8 q15, d0, d22 + vmull.u8 q14, d17, d5 + vmlal.u8 q14, d1, d23 + vrshrn.i16 d0, q15, #6 + vrshrn.i16 d1, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d24 + vrshrn.i16 d2, q15, #6 + vst1.u8 {d0, d1, d2}, [r0, :64], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + + +// This has got the same signature as the put_8tap functions, +// assumes that the caller has loaded the h argument into r5, +// and assumes that r8 is set to (clz(w)-24). 
+function put_neon + adr r9, L(put_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(put_tbl): + .word 1280f - L(put_tbl) + CONFIG_THUMB + .word 640f - L(put_tbl) + CONFIG_THUMB + .word 32f - L(put_tbl) + CONFIG_THUMB + .word 160f - L(put_tbl) + CONFIG_THUMB + .word 8f - L(put_tbl) + CONFIG_THUMB + .word 4f - L(put_tbl) + CONFIG_THUMB + .word 2f - L(put_tbl) + CONFIG_THUMB + +2: + vld1.16 {d0[]}, [r2], r3 + vld1.16 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d1[0]}, [r0, :16], r1 + bgt 2b + pop {r4-r11,pc} +4: + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + bgt 4b + pop {r4-r11,pc} +8: + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r2], r3 + subs r5, r5, #2 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d1}, [r0, :64], r1 + bgt 8b + pop {r4-r11,pc} +160: + add r8, r0, r1 + lsl r1, r1, #1 + add r9, r2, r3 + lsl r3, r3, #1 +16: + vld1.8 {q0}, [r2], r3 + vld1.8 {q1}, [r9], r3 + subs r5, r5, #2 + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r8, :128], r1 + bgt 16b + pop {r4-r11,pc} +32: + vld1.8 {q0, q1}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q0, q1}, [r0, :128], r1 + bgt 32b + pop {r4-r11,pc} +640: + sub r1, r1, #32 + sub r3, r3, #32 +64: + vld1.8 {q0, q1}, [r2]! + vst1.8 {q0, q1}, [r0, :128]! + vld1.8 {q2, q3}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 64b + pop {r4-r11,pc} +1280: + sub r1, r1, #96 + sub r3, r3, #96 +128: + vld1.8 {q8, q9}, [r2]! + vst1.8 {q8, q9}, [r0, :128]! + vld1.8 {q10, q11}, [r2]! + vst1.8 {q10, q11}, [r0, :128]! + vld1.8 {q12, q13}, [r2]! + vst1.8 {q12, q13}, [r0, :128]! + vld1.8 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q14, q15}, [r0, :128], r1 + bgt 128b + pop {r4-r11,pc} +endfunc + + +// This has got the same signature as the prep_8tap functions, +// assumes that the caller has loaded the h argument into r4, +// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
+function prep_neon + adr r9, L(prep_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(prep_tbl): + .word 1280f - L(prep_tbl) + CONFIG_THUMB + .word 640f - L(prep_tbl) + CONFIG_THUMB + .word 320f - L(prep_tbl) + CONFIG_THUMB + .word 160f - L(prep_tbl) + CONFIG_THUMB + .word 8f - L(prep_tbl) + CONFIG_THUMB + .word 4f - L(prep_tbl) + CONFIG_THUMB + +4: + vld1.32 {d0[]}, [r1], r2 + vld1.32 {d2[]}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q0, d0, #4 + vshll.u8 q1, d2, #4 + vst1.16 {d1, d2}, [r0, :64]! + bgt 4b + pop {r4-r11,pc} +8: + vld1.8 {d0}, [r1], r2 + vld1.8 {d2}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q0, d0, #4 + vshll.u8 q1, d2, #4 + vst1.16 {q0, q1}, [r0, :128]! + bgt 8b + pop {r4-r11,pc} +160: + add r9, r1, r2 + lsl r2, r2, #1 + add r8, r0, r7 + lsl r7, r7, #1 +16: + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r9], r2 + subs r4, r4, #2 + vshll.u8 q0, d4, #4 + vshll.u8 q1, d5, #4 + vshll.u8 q2, d6, #4 + vshll.u8 q3, d7, #4 + vst1.16 {q0, q1}, [r0, :128], r7 + vst1.16 {q2, q3}, [r8, :128], r7 + bgt 16b + pop {r4-r11,pc} +320: + add r8, r0, r3 +32: + vld1.8 {q0, q1}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q8, d0, #4 + vshll.u8 q9, d1, #4 + vld1.8 {q2, q3}, [r1], r2 + vshll.u8 q10, d2, #4 + vshll.u8 q11, d3, #4 + vshll.u8 q12, d4, #4 + vst1.16 {q8, q9}, [r0, :128], r7 + vshll.u8 q13, d5, #4 + vst1.16 {q10, q11}, [r8, :128], r7 + vshll.u8 q14, d6, #4 + vst1.16 {q12, q13}, [r0, :128], r7 + vshll.u8 q15, d7, #4 + vst1.16 {q14, q15}, [r8, :128], r7 + bgt 32b + pop {r4-r11,pc} +640: + sub r2, r2, #32 + add r8, r0, #32 + mov r6, #64 +64: + vld1.8 {q0, q1}, [r1]! 
+ subs r4, r4, #1 + vshll.u8 q8, d0, #4 + vshll.u8 q9, d1, #4 + vld1.8 {q2, q3}, [r1], r2 + vshll.u8 q10, d2, #4 + vshll.u8 q11, d3, #4 + vshll.u8 q12, d4, #4 + vst1.16 {q8, q9}, [r0, :128], r6 + vshll.u8 q13, d5, #4 + vshll.u8 q14, d6, #4 + vst1.16 {q10, q11}, [r8, :128], r6 + vshll.u8 q15, d7, #4 + vst1.16 {q12, q13}, [r0, :128], r6 + vst1.16 {q14, q15}, [r8, :128], r6 + bgt 64b + pop {r4-r11,pc} +1280: + sub r2, r2, #96 + add r8, r0, #32 + mov r6, #64 +128: + vld1.8 {q0, q1}, [r1]! + vld1.8 {q2, q3}, [r1]! + vshll.u8 q10, d0, #4 + vshll.u8 q11, d1, #4 + vshll.u8 q12, d2, #4 + vshll.u8 q13, d3, #4 + vshll.u8 q14, d4, #4 + vshll.u8 q15, d5, #4 + vld1.8 {q8, q9}, [r1]! + vst1.16 {q10, q11}, [r0, :128], r6 + vst1.16 {q12, q13}, [r8, :128], r6 + vshll.u8 q0, d6, #4 + vshll.u8 q1, d7, #4 + vshll.u8 q2, d16, #4 + vshll.u8 q3, d17, #4 + vshll.u8 q8, d18, #4 + vshll.u8 q9, d19, #4 + vld1.8 {q10, q11}, [r1], r2 + vst1.16 {q14, q15}, [r0, :128], r6 + vst1.16 {q0, q1}, [r8, :128], r6 + vshll.u8 q12, d20, #4 + vshll.u8 q13, d21, #4 + vshll.u8 q14, d22, #4 + vshll.u8 q15, d23, #4 + subs r4, r4, #1 + vst1.16 {q2, q3}, [r0, :128], r6 + vst1.16 {q8, q9}, [r8, :128], r6 + vst1.16 {q12, q13}, [r0, :128], r6 + vst1.16 {q14, q15}, [r8, :128], r6 + bgt 128b + pop {r4-r11,pc} +endfunc + + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + vld1.\wd {\d0[]}, [\s0], \strd + vld1.\wd {\d1[]}, [\s1], \strd +.ifnb \d2 + vld1.\wd {\d2[]}, [\s0], \strd + vld1.\wd {\d3[]}, [\s1], \strd +.endif +.ifnb \d4 + vld1.\wd {\d4[]}, [\s0], \strd +.endif +.ifnb \d5 + vld1.\wd {\d5[]}, [\s1], \strd +.endif +.ifnb \d6 + vld1.\wd {\d6[]}, [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + vld1.8 {\d0}, [\s0], \strd + vld1.8 {\d1}, [\s1], \strd +.ifnb \d2 + vld1.8 {\d2}, [\s0], \strd + vld1.8 {\d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.8 {\d4}, [\s0], \strd +.endif +.ifnb \d5 + vld1.8 {\d5}, [\s1], \strd +.endif +.ifnb \d6 + vld1.8 {\d6}, [\s0], \strd 
+.endif +.endm +.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 /* load_slice with 16-bit elements */ + load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 /* load_slice with 32-bit elements */ + load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro interleave_1_16 r0, r1, r2, r3, r4 /* Chain consecutive registers with vext: each register becomes its own top 2 bytes followed by the low 6 bytes of the still unmodified next register. r0, r1 and r2 are mandatory; r3 and r4 must be given together. */ + vext.8 \r0, \r0, \r1, #6 + vext.8 \r1, \r1, \r2, #6 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #6 + vext.8 \r3, \r3, \r4, #6 +.endif +.endm +.macro interleave_1_32 r0, r1, r2, r3, r4 /* Same chaining as interleave_1_16 with a 4-byte offset: each register becomes its own top 4 bytes followed by the low 4 bytes of the next one */ + vext.8 \r0, \r0, \r1, #4 + vext.8 \r1, \r1, \r2, #4 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #4 + vext.8 \r3, \r3, \r4, #4 +.endif +.endm +.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6 /* Zero-extend up to seven d registers of bytes into q registers of 16-bit values. Pairs 0 and 1 are mandatory; pairs 2 and 3 must be given together; pairs 4, 5 and 6 are individually optional. */ + vmovl.u8 \q0, \d0 + vmovl.u8 \q1, \d1 +.ifnb \q2 + vmovl.u8 \q2, \d2 + vmovl.u8 \q3, \d3 +.endif +.ifnb \q4 + vmovl.u8 \q4, \d4 +.endif +.ifnb \q5 + vmovl.u8 \q5, \d5 +.endif +.ifnb \q6 + vmovl.u8 \q6, \d6 +.endif +.endm +.macro mul_mla_4 d, s0, s1, s2, s3 /* 4-tap sum on 16-bit lanes: d = s0*c0 + s1*c1 + s2*c2 + s3*c3, with c0-c3 taken from the lanes of register d0 */ + vmul.s16 \d, \s0, d0[0] + vmla.s16 \d, \s1, d0[1] + vmla.s16 \d, \s2, d0[2] + vmla.s16 \d, \s3, d0[3] +.endm +.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 /* Two 8-tap sums on 16-bit lanes. The coefficients come from the lanes of the literal registers d0 and d1, which are distinct from the macro arguments of the same name (those are only referenced with a backslash). The first output uses s0-s7, the second s1-s8, i.e. the window advanced by one input. */ + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] + vmul.s16 \d1, \s1, d0[0] + vmla.s16 \d1, \s2, d0[1] + vmla.s16 \d1, \s3, d0[2] + vmla.s16 \d1, \s4, d0[3] + vmla.s16 \d1, \s5, d1[0] + vmla.s16 \d1, \s6, d1[1] + vmla.s16 \d1, \s7, d1[2] + vmla.s16 \d1, \s8, d1[3] +.endm +.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 /* Like mul_mla_8_1, but the second output uses s2-s9, i.e. the window advanced by two inputs */ + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] + vmul.s16 \d1, \s2, d0[0] + vmla.s16 \d1, \s3, d0[1] + vmla.s16 \d1, \s4, d0[2] + vmla.s16 \d1, \s5, d0[3] + vmla.s16 \d1, \s6, d1[0] + 
vmla.s16 \d1, \s7, d1[1] + vmla.s16 \d1, \s8, d1[2] + vmla.s16 \d1, \s9, d1[3] +.endm +.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 /* Two 8-tap sums on 16-bit lanes with coefficients from the lanes of the literal registers d0 and d1: the first output uses s0-s7, the second s4-s11, i.e. the window advanced by four inputs */ + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] + vmul.s16 \d1, \s4, d0[0] + vmla.s16 \d1, \s5, d0[1] + vmla.s16 \d1, \s6, d0[2] + vmla.s16 \d1, \s7, d0[3] + vmla.s16 \d1, \s8, d1[0] + vmla.s16 \d1, \s9, d1[1] + vmla.s16 \d1, \s10, d1[2] + vmla.s16 \d1, \s11, d1[3] +.endm +.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 /* Apply vqrshrun.s16 with the immediate shift to up to four q/d pairs. The first pair is mandatory, the second is optional, and the third and fourth must be given together. */ + vqrshrun.s16 \d0, \q0, #\shift +.ifnb \q1 + vqrshrun.s16 \d1, \q1, #\shift +.endif +.ifnb \q2 + vqrshrun.s16 \d2, \q2, #\shift + vqrshrun.s16 \d3, \q3, #\shift +.endif +.endm +.macro vrshr_s16 shift, r0, r1, r2, r3 /* In-place vrshr.s16 by the immediate shift on up to four registers: r0 is mandatory, r1 is optional, r2 and r3 must be given together */ + vrshr.s16 \r0, \r0, #\shift +.ifnb \r1 + vrshr.s16 \r1, \r1, #\shift +.endif +.ifnb \r2 + vrshr.s16 \r2, \r2, #\shift + vrshr.s16 \r3, \r3, #\shift +.endif +.endm +.macro st_16 strd, reg, lanes /* Store 16-bit lanes of reg alternately through the fixed pointer registers r0 and r8, post-incrementing each by strd: lanes 0 and 1 always, lanes 2 and 3 when lanes is greater than 2 */ + vst1.16 {\reg[0]}, [r0, :16], \strd + vst1.16 {\reg[1]}, [r8, :16], \strd +.if \lanes > 2 + vst1.16 {\reg[2]}, [r0, :16], \strd + vst1.16 {\reg[3]}, [r8, :16], \strd +.endif +.endm +.macro st_32 strd, r0, r1 /* Store both 32-bit lanes of each given d register alternately through the literal pointer registers r0 and r8, which are distinct from the macro arguments named r0 and r1 (those are only referenced with a backslash). The second register is optional. */ + vst1.32 {\r0[0]}, [r0, :32], \strd + vst1.32 {\r0[1]}, [r8, :32], \strd +.ifnb \r1 + vst1.32 {\r1[0]}, [r0, :32], \strd + vst1.32 {\r1[1]}, [r8, :32], \strd +.endif +.endm +.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 /* Store whole registers alternately through the literal pointer registers r0 and r8 using the given alignment specifier: two registers always, two more when the third is given, and four more (fifth to eighth together) when the fifth is given */ + vst1.8 {\r0}, [r0, \align], \strd + vst1.8 {\r1}, [r8, \align], \strd +.ifnb \r2 + vst1.8 {\r2}, [r0, \align], \strd + vst1.8 {\r3}, [r8, \align], \strd +.endif +.ifnb \r4 + vst1.8 {\r4}, [r0, \align], \strd + vst1.8 {\r5}, [r8, \align], \strd + vst1.8 {\r6}, [r0, \align], \strd + vst1.8 {\r7}, [r8, \align], \strd +.endif +.endm +.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3 /* type put: narrow q0 (and q1 when given) with a saturating rounding right shift by 6 and store 32-bit lanes via st_32; any other type: rounding right shift by 2 in place and store the d registers via st_reg with :64 alignment */ +.ifc \type, put + vqrshrun_s16 6, \q0, \d0, \q1, \d2 + st_32 \strd, \d0, \d2 +.else + 
vrshr_s16 2, \q0, \q1 + st_reg \strd, :64, \d0, \d1, \d2, \d3 +.endif +.endm +.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3 /* type put: narrow each given q register into its paired d register with a saturating rounding right shift by 6 and store 8-byte rows via st_reg; any other type: rounding right shift by 2 in place and store the q registers as 16-byte rows */ +.ifc \type, put + vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + st_reg \strd, :64, \d0, \d1, \d2, \d3 +.else + vrshr_s16 2, \q0, \q1, \q2, \q3 + st_reg \strd, :128,\q0, \q1, \q2, \q3 +.endif +.endm +.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3 /* type put: narrow q0 and q1 into d0 and d1, and q2 and q3 into d4 and d5 (shift 6), then store q0 and q2 as two 16-byte rows; this only yields the narrowed data if the caller passes d0/d1 and d4/d5 as the halves of q0 and q2. Any other type: rounding right shift by 2 in place, then store q0,q1 through pointer r0 and q2,q3 through pointer r8. */ +.ifc \type, put + vqrshrun.s16 \d0, \q0, #6 + vqrshrun.s16 \d1, \q1, #6 + vqrshrun.s16 \d4, \q2, #6 + vqrshrun.s16 \d5, \q3, #6 + st_reg \strd, :128, \q0, \q2 +.else + vrshr_s16 2, \q0, \q1, \q2, \q3 + vst1.16 {\q0, \q1}, [r0, :128], \strd + vst1.16 {\q2, \q3}, [r8, :128], \strd +.endif +.endm + +.macro make_8tap_fn op, type, type_h, type_v /* Emit the exported entry point <op>_8tap_<type>_8bpc_neon: it pushes r4-r11 and lr, places the type_h and type_v constants in r8 and r9, and branches (without linking) to the shared <op>_8tap_neon body */ +function \op\()_8tap_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + movw r8, \type_h + movw r9, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r10 + mul \my, \my, r10 + add \mx, \mx, r8 // mx, 8tap_h, 4tap_h + add \my, \my, r9 // my, 8tap_v, 4tap_v +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + clz r8, \w + tst \mx, #(0x7f << 14) + sub r8, r8, #24 + movrel r10, X(mc_subpel_filters), -8 + 
bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r9, \mx, #7, #7 + and \mx, \mx, #0x7f + it gt + movgt \mx, r9 + tst \my, #(0x7f << 14) + add \mx, r10, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r9, L(\type\()_8tap_h_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.8 {d4}, [\src], \s_strd + vld1.8 {d6}, [\sr2], \s_strd + vmovl.u8 q2, d4 + vmovl.u8 q3, d6 + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmul.s16 d2, d4, d0[0] + vmla.s16 d2, d5, d0[1] + vmla.s16 d2, d6, d0[2] + vmla.s16 d2, d7, d0[3] + vrshr.s16 d2, d2, #2 + vqrshrun.s16 d2, q1, #4 + vst1.16 {d2[0]}, [\dst, :16], \d_strd + vst1.16 {d2[1]}, [\ds2, :16], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.8 {d16}, [\src], \s_strd + vld1.8 {d24}, [\sr2], \s_strd + vmovl.u8 q8, d16 + vmovl.u8 q12, d24 + vext.8 d18, d16, d17, #2 + vext.8 d20, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d26, d24, d25, #2 + vext.8 d28, d24, d25, #4 + vext.8 d30, d24, d25, #6 + subs \h, \h, #2 + vmul.s16 d4, d16, d0[0] + vmla.s16 d4, d18, d0[1] 
+ vmla.s16 d4, d20, d0[2] + vmla.s16 d4, d22, d0[3] + vmul.s16 d5, d24, d0[0] + vmla.s16 d5, d26, d0[1] + vmla.s16 d5, d28, d0[2] + vmla.s16 d5, d30, d0[3] + vrshr.s16 q2, q2, #2 +.ifc \type, put + vqrshrun.s16 d4, q2, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd +.endif + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +8: + vld1.8 {q8}, [\src], \s_strd + vld1.8 {q12}, [\sr2], \s_strd + vmovl.u8 q9, d17 + vmovl.u8 q8, d16 + vmovl.u8 q13, d25 + vmovl.u8 q12, d24 + + vmul.s16 q10, q8, d0[0] + vmul.s16 q14, q12, d0[0] +.irpc i, 1234567 + vext.8 q11, q8, q9, #(2*\i) + vext.8 q15, q12, q13, #(2*\i) +.if \i < 4 + vmla.s16 q10, q11, d0[\i] + vmla.s16 q14, q15, d0[\i] +.else + vmla.s16 q10, q11, d1[\i-4] + vmla.s16 q14, q15, d1[\i-4] +.endif +.endr + subs \h, \h, #2 + vrshr.s16 q10, q10, #2 + vrshr.s16 q14, q14, #2 +.ifc \type, put + vqrshrun.s16 d20, q10, #4 + vqrshrun.s16 d28, q14, #4 + vst1.8 {d20}, [\dst, :64], \d_strd + vst1.8 {d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q10}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + bgt 8b + pop {r4-r11,pc} + +160: +320: +640: +1280: // 16xN, 32xN, ... h + // This could be done without touching q4-q6, by using only + // one temporary for vext in the loop. That's slower on A7 and A53, + // (but surprisingly, marginally faster on A8 and A73). + vpush {q4-q6} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w +.endif +161: + vld1.8 {d16, d17, d18}, [\src]! + vld1.8 {d24, d25, d26}, [\sr2]! 
+ mov \mx, \w + vmovl.u8 q10, d18 + vmovl.u8 q9, d17 + vmovl.u8 q8, d16 + vmovl.u8 q14, d26 + vmovl.u8 q13, d25 + vmovl.u8 q12, d24 + +16: + vmul.s16 q1, q8, d0[0] + vmul.s16 q2, q9, d0[0] + vmul.s16 q3, q12, d0[0] + vmul.s16 q4, q13, d0[0] +.irpc i, 1234567 + vext.8 q5, q8, q9, #(2*\i) + vext.8 q6, q9, q10, #(2*\i) + vext.8 q11, q12, q13, #(2*\i) + vext.8 q15, q13, q14, #(2*\i) +.if \i < 4 + vmla.s16 q1, q5, d0[\i] + vmla.s16 q2, q6, d0[\i] + vmla.s16 q3, q11, d0[\i] + vmla.s16 q4, q15, d0[\i] +.else + vmla.s16 q1, q5, d1[\i-4] + vmla.s16 q2, q6, d1[\i-4] + vmla.s16 q3, q11, d1[\i-4] + vmla.s16 q4, q15, d1[\i-4] +.endif +.endr + vrshr.s16 q1, q1, #2 + vrshr.s16 q2, q2, #2 + vrshr.s16 q3, q3, #2 + vrshr.s16 q4, q4, #2 + subs \mx, \mx, #16 +.ifc \type, put + vqrshrun.s16 d2, q1, #4 + vqrshrun.s16 d3, q2, #4 + vqrshrun.s16 d4, q3, #4 + vqrshrun.s16 d5, q4, #4 + vst1.8 {q1}, [\dst, :128]! + vst1.8 {q2}, [\ds2, :128]! +.else + vst1.16 {q1, q2}, [\dst, :128]! + vst1.16 {q3, q4}, [\ds2, :128]! +.endif + ble 9f + + vmov q8, q10 + vmov q12, q14 + vld1.8 {d18, d19}, [\src]! + vld1.8 {d26, d27}, [\sr2]! 
+ vmovl.u8 q10, d19 + vmovl.u8 q9, d18 + vmovl.u8 q14, d27 + vmovl.u8 q13, d26 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q6} + pop {r4-r11,pc} + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r9, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r9 + add \my, r10, \my, lsl #3 + + adr r9, L(\type\()_8tap_v_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_16 d1, d2, d3, d4, d5 + bgt 24f + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 + mul_mla_4 d6, d16, d18, d20, d22 + vqrshrun_s16 6, q3, d6 + st_16 \d_strd, d6, 2 + pop {r4-r11,pc} + +24: // 2x4 v + load_16 \sr2, \src, \s_strd, d6, d7 + interleave_1_16 d5, d6, d7 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6 + vmov d17, d20 + vmov d19, d22 + vmov d21, d24 + vmov d23, d26 + mul_mla_4 q3, q8, q9, q10, q11 + vqrshrun_s16 6, q3, d6 + st_16 \d_strd, d6, 4 + pop {r4-r11,pc} + +28: // 2x8, 2x16 v + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14 + 
interleave_1_16 d2, d4, d6, d8, d10 + interleave_1_16 d10, d12, d14 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12 + vmov d3, d6 + vmov d5, d8 + vmov d7, d10 + vmov d9, d12 +216: + subs \h, \h, #8 + load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 + load_16 \sr2, \src, \s_strd, d24, d26, d28, d30 + interleave_1_16 d14, d16, d18, d20, d22 + interleave_1_16 d22, d24, d26, d28, d30 + vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 + vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28 + vmov d11, d14 + vmov d13, d16 + vmov d15, d18 + vmov d17, d20 + vmov d19, d22 + vmov d21, d24 + vmov d23, d26 + vmov d25, d28 + mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12 + vqrshrun_s16 6, q1, d2, q2, d4 + st_16 \d_strd, d2, 4 + st_16 \d_strd, d4, 4 + ble 0f + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov q4, q12 + vmov q5, q13 + vmov q6, q14 + vmov d14, d30 + b 216b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 + mul_mla_4 q3, q8, q9, q10, q11 + shift_store_4 \type, \d_strd, q3, d6, d7 + ble 0f + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmovl_u8 q12, d5, q13, d6 + mul_mla_4 q3, q10, q11, q12, q13 + shift_store_4 \type, \d_strd, q3, d6, d7 +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16 v + vpush {q4} + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 + interleave_1_32 d2, d4, d6 + interleave_1_32 d6, d8, d16, d18, d20 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, 
q9, d18 + +48: + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d22, d24, d26, d28 + interleave_1_32 d20, d22, d24, d26, d28 + vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26 + mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 + shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 + ble 0f + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d30, d2, d4, d6 + interleave_1_32 d28, d30, d2, d4, d6 + vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4 + mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2 + shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19 + ble 0f + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 + interleave_1_32 d6, d8, d16, d18, d20 + vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 + mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 + shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 + bgt 48b +0: + vpop {q4} + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5 + mul_mla_4 q1, q8, q9, q10, q11 + mul_mla_4 q2, q9, q10, q11, q12 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmovl_u8 q13, d6, q14, d7 + mul_mla_4 q1, q10, q11, q12, q13 + mul_mla_4 q2, q11, q12, q13, q14 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... 
+640: +1280: + vpush {q4} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d22, d24 + vmovl_u8 q11, d22, q12, d24 + mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d26, d28 + vmovl_u8 q13, d26, q14, d28 + mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 + shift_store_8 \type, \d_strd, q3, d6, q4, d8 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d30, d2 + vmovl_u8 q15, d30, q1, d2 + mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 + shift_store_8 \type, \d_strd, q8, d16, q9, d18 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d4, d6 + vmovl_u8 q2, d4, q3, d6 + mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 + shift_store_8 \type, \d_strd, q10, d20, q11, d22 + ble 9f + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 + vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20 + mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8 + mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10 + shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30 + bgt 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + vpop {q4} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, 
\dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + cmp \h, #2 + load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15 + vmovl.u8 q1, d22 + vmovl.u8 q2, d24 + vmovl.u8 q3, d26 + vmovl.u8 q8, d28 + vmovl.u8 q9, d30 + vmovl.u8 q11, d23 + vmovl.u8 q12, d25 + vmovl.u8 q13, d27 + vmovl.u8 q14, d29 + vmovl.u8 q15, d31 + mul_mla_4 q1, q1, q2, q3, q8 + mul_mla_4 q10, q2, q3, q8, q9 + mul_mla_4 q2, q11, q12, q13, q14 + mul_mla_4 q11, q12, q13, q14, q15 + shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmovl.u8 q1, d20 + vmovl.u8 q10, d21 + vmovl.u8 q12, d22 + vmovl.u8 q11, d23 + mul_mla_4 q2, q3, q8, q9, q1 + mul_mla_4 q3, q13, q14, q15, q10 + mul_mla_4 q13, q8, q9, q1, q12 + mul_mla_4 q14, q14, q15, q10, q11 + shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14 +0: + pop {r4-r11,pc} + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx r9, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r9 + add \my, r10, \my, lsl #3 + + adr r9, L(\type\()_8tap_hv_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + +20: +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + + vld1.8 {d26}, [\src], \s_strd + vmovl.u8 q13, d26 + vext.8 q14, q13, q13, #2 + vmul.s16 d26, d26, d0 + vmul.s16 d28, 
d28, d0 + vpadd.s16 d26, d26, d28 + vpadd.s16 d26, d26, d26 + vrshr.s16 d16, d26, #2 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vmov d17, d26 + vext.8 d16, d16, d26, #4 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d26, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d26, d2[3] + + vqrshrn.s32 d4, q2, #\shift_hv + vqmovun.s16 d4, q2 + subs \h, \h, #2 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d26 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.8 {d26}, [\src], \s_strd + vmovl.u8 q13, d26 + vext.8 q14, q13, q13, #2 + vmul.s16 d26, d26, d0 + vmul.s16 d28, d28, d0 + vpadd.s16 d26, d26, d28 + vpadd.s16 d26, d26, d26 + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_2) + vext.8 d16, d16, d16, #4 + vmov d17, d26 + vext.8 d16, d16, d26, #4 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d26, #4 + vmov d19, d26 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d26, #4 + vmov d21, d26 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d26, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d20, d3[0] + vmlal.s16 q2, d21, d3[1] + vmlal.s16 q2, d22, d3[2] + vmlal.s16 q2, d26, d3[3] + + vqrshrn.s32 d4, q2, #\shift_hv + vqmovun.s16 d4, q2 + subs \h, \h, #2 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d19 + vmov d18, d20 + vmov d19, d21 + vmov d20, d22 + vmov d21, d26 + b 28b + +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.8 {d28}, [\sr2], \s_strd + vld1.8 {d30}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + 
vmovl.u8 q13, d28 + vmovl.u8 q14, d29 + vmov d27, d28 + vmovl.u8 q14, d30 + vmovl.u8 q15, d31 + vtrn.32 d26, d28 + vtrn.32 d27, d30 + vmul.s16 d26, d26, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d28, d0[2] + vmla.s16 d26, d30, d0[3] + vrshr.s16 d26, d26, #2 + vext.8 d27, d26, d26, #4 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.8 {d30}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d31, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d31, d0[3] + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_4) + vmov d17, d26 + vmov d18, d27 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d26, d2[3] + vmull.s16 q3, d17, d2[0] + vmlal.s16 q3, d18, d2[1] + vmlal.s16 q3, d26, d2[2] + vmlal.s16 q3, d27, d2[3] + vqrshrn.s32 d4, q2, #\shift_hv + vqrshrn.s32 d6, q3, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d4, q2 + vqmovun.s16 d6, q3 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d6[0]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d6}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + vmov d17, d26 + vmov d18, d27 + b 4b + +480: // 4x8, 4x16, 4x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.8 {d30}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d31, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + 
vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d31, d0[3] + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_4) + vmov d17, d26 + vmov d18, d27 + bl L(\type\()_8tap_filter_4) + vmov d19, d26 + vmov d20, d27 + bl L(\type\()_8tap_filter_4) + vmov d21, d26 + vmov d22, d27 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d20, d3[0] + vmlal.s16 q2, d21, d3[1] + vmlal.s16 q2, d22, d3[2] + vmlal.s16 q2, d26, d3[3] + vmull.s16 q3, d17, d2[0] + vmlal.s16 q3, d18, d2[1] + vmlal.s16 q3, d19, d2[2] + vmlal.s16 q3, d20, d2[3] + vmlal.s16 q3, d21, d3[0] + vmlal.s16 q3, d22, d3[1] + vmlal.s16 q3, d26, d3[2] + vmlal.s16 q3, d27, d3[3] + vqrshrn.s32 d4, q2, #\shift_hv + vqrshrn.s32 d6, q3, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d4, q2 + vqmovun.s16 d6, q3 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d6[0]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d6}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + vmov d17, d19 + vmov d18, d20 + vmov d19, d21 + vmov d20, d22 + vmov d21, d26 + vmov d22, d27 + b 48b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.8 {d30}, [\sr2], \s_strd + vld1.8 {d31}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d1, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d1, d0[3] + + vmovl.u8 q14, d31 + vext.8 d30, d28, d29, #2 + vext.8 d31, d28, d29, #4 + vext.8 d1, d28, d29, #6 + vmul.s16 d27, d28, d0[0] + vmla.s16 d27, d30, d0[1] + vmla.s16 d27, d31, d0[2] + vmla.s16 d27, d1, d0[3] + vrshr.s16 d26, d26, #2 + vrshr.s16 d27, d27, #2 + bx lr + +80: +160: +320: + bgt 880f + vpush {q4-q7} + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #3 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 
8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.8 {q14}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vrshr.s16 q3, q10, #2 + + bl L(\type\()_8tap_filter_8) + vmov q4, q10 + vmov q5, q11 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q12, d6, d2[0] + vmull.s16 q13, d7, d2[0] + vmull.s16 q14, d8, d2[0] + vmull.s16 q15, d9, d2[0] + vmlal.s16 q12, d8, d2[1] + vmlal.s16 q13, d9, d2[1] + vmlal.s16 q14, d10, d2[1] + vmlal.s16 q15, d11, d2[1] + vmlal.s16 q12, d10, d2[2] + vmlal.s16 q13, d11, d2[2] + vmlal.s16 q14, d20, d2[2] + vmlal.s16 q15, d21, d2[2] + vmlal.s16 q12, d20, d2[3] + vmlal.s16 q13, d21, d2[3] + vmlal.s16 q14, d22, d2[3] + vmlal.s16 q15, d23, d2[3] + vqrshrn.s32 d24, q12, #\shift_hv + vqrshrn.s32 d25, q13, #\shift_hv + vqrshrn.s32 d28, q14, #\shift_hv + vqrshrn.s32 d29, q15, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d24, q12 + vqmovun.s16 d28, q14 + vst1.8 {d24}, [\dst, :64], \d_strd + vst1.8 {d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q12}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q3, q5 + vmov q4, q10 + vmov q5, q11 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #3 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.8 {q14}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vrshr.s16 q3, q10, #2 + + bl L(\type\()_8tap_filter_8) + vmov q4, q10 + vmov q5, q11 + bl L(\type\()_8tap_filter_8) + vmov q6, q10 + vmov q7, q11 + bl L(\type\()_8tap_filter_8) + vmov q8, q10 + vmov q9, q11 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q12, d6, d2[0] + vmull.s16 q13, d7, d2[0] + vmull.s16 q14, d8, d2[0] + vmull.s16 q15, d9, d2[0] + vmlal.s16 q12, d8, d2[1] + vmlal.s16 q13, d9, d2[1] + vmlal.s16 q14, d10, d2[1] + vmlal.s16 q15, d11, d2[1] + vmlal.s16 q12, d10, d2[2] + vmlal.s16 q13, d11, d2[2] + vmlal.s16 q14, d12, d2[2] + vmlal.s16 q15, d13, d2[2] + vmlal.s16 q12, d12, d2[3] + vmlal.s16 q13, d13, d2[3] + vmlal.s16 q14, d14, d2[3] + vmlal.s16 q15, d15, d2[3] + vmlal.s16 q12, d14, d3[0] + vmlal.s16 q13, d15, d3[0] + vmlal.s16 q14, d16, d3[0] + vmlal.s16 q15, d17, d3[0] + vmlal.s16 q12, d16, d3[1] + vmlal.s16 q13, d17, d3[1] + vmlal.s16 q14, d18, d3[1] + vmlal.s16 q15, d19, d3[1] + vmlal.s16 q12, d18, d3[2] + vmlal.s16 q13, d19, d3[2] + vmlal.s16 q14, d20, d3[2] + vmlal.s16 q15, d21, d3[2] + vmlal.s16 q12, d20, d3[3] + vmlal.s16 q13, d21, d3[3] + vmlal.s16 q14, d22, d3[3] + vmlal.s16 q15, d23, d3[3] + vqrshrn.s32 d24, q12, #\shift_hv + vqrshrn.s32 d25, q13, #\shift_hv + vqrshrn.s32 d28, q14, #\shift_hv + vqrshrn.s32 d29, q15, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d24, q12 + vqmovun.s16 d28, q14 + vst1.8 {d24}, [\dst, :64], \d_strd + vst1.8 
{d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q12}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q3, q5 + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_8): + vld1.8 {q14}, [\sr2], \s_strd + vld1.8 {q15}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vmovl.u8 q12, d30 + vmovl.u8 q13, d31 + vmul.s16 q11, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q11, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q11, q14, d1[\i-4] +.endr + vrshr.s16 q10, q10, #2 + vrshr.s16 q11, q11, #2 + bx lr +endfunc + + +function \type\()_bilin_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + vdup.8 d1, \mx + vdup.8 d3, \my + rsb r8, \mx, #16 + rsb r9, \my, #16 + vdup.8 d0, r8 + vdup.8 d2, r9 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz r8, \w + cmp \mx, #0 + sub r8, r8, #24 + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r9, L(\type\()_bilin_h_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + 
CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.32 {d4[]}, [\src], \s_strd + vld1.32 {d6[]}, [\sr2], \s_strd + vext.8 d5, d4, d4, #1 + vext.8 d7, d6, d6, #1 + vtrn.16 q2, q3 + subs \h, \h, #2 + vmull.u8 q3, d4, d0 + vmlal.u8 q3, d5, d1 + vqrshrn.u16 d4, q3, #4 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.8 {d4}, [\src], \s_strd + vld1.8 {d6}, [\sr2], \s_strd + vext.8 d5, d4, d4, #1 + vext.8 d7, d6, d6, #1 + vtrn.32 q2, q3 + subs \h, \h, #2 + vmull.u8 q3, d4, d0 + vmlal.u8 q3, d5, d1 +.ifc \type, put + vqrshrn.u16 d4, q3, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d6}, [\dst, :64], \d_strd + vst1.16 {d7}, [\ds2, :64], \d_strd +.endif + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.8 {q8}, [\src], \s_strd + vld1.8 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #1 + vext.8 q11, q10, q10, #1 + subs \h, \h, #2 + vmull.u8 q8, d16, d0 + vmull.u8 q10, d20, d0 + vmlal.u8 q8, d18, d1 + vmlal.u8 q10, d22, d1 +.ifc \type, put + vqrshrn.u16 d16, q8, #4 + vqrshrn.u16 d18, q10, #4 + vst1.8 {d16}, [\dst, :64], \d_strd + vst1.8 {d18}, [\ds2, :64], \d_strd +.else + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd +.endif + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... 
h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w +.endif +161: + vld1.8 {d16}, [\src]! + vld1.8 {d22}, [\sr2]! + mov \mx, \w + +16: + vld1.8 {d17,d18}, [\src]! + vld1.8 {d23,d24}, [\sr2]! + vext.8 q10, q8, q9, #1 + vext.8 q13, q11, q12, #1 + vmull.u8 q2, d16, d0 + vmull.u8 q3, d17, d0 + vmull.u8 q14, d22, d0 + vmull.u8 q15, d23, d0 + vmlal.u8 q2, d20, d1 + vmlal.u8 q3, d21, d1 + vmlal.u8 q14, d26, d1 + vmlal.u8 q15, d27, d1 + subs \mx, \mx, #16 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vqrshrn.u16 d5, q3, #4 + vqrshrn.u16 d28, q14, #4 + vqrshrn.u16 d29, q15, #4 + vst1.8 {q2}, [\dst, :128]! + vst1.8 {q14}, [\ds2, :128]! +.else + vst1.16 {q2, q3}, [\dst, :128]! + vst1.16 {q14, q15}, [\ds2, :128]! +.endif + ble 9f + + vmov d16, d18 + vmov d22, d24 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + pop {r4-r11,pc} + +L(\type\()_bilin_v): + cmp \h, #4 + adr r9, L(\type\()_bilin_v_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.16 {d16[]}, [\src], \s_strd + bgt 24f + vld1.16 {d17[]}, [\sr2], \s_strd + vld1.16 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #6 + vext.8 d17, d17, d18, #6 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, 
d3 + vqrshrn.u16 d4, q2, #4 + vst1.16 {d4[0]}, [\dst, :16] + vst1.16 {d4[1]}, [\ds2, :16] + pop {r4-r11,pc} +24: // 2x4, 2x8, ... v + vld1.16 {d17[]}, [\sr2], \s_strd + vld1.16 {d18[]}, [\src], \s_strd + vld1.16 {d19[]}, [\sr2], \s_strd + vld1.16 {d20[]}, [\src], \s_strd + vext.8 d16, d16, d17, #6 + vext.8 d17, d17, d18, #6 + vext.8 d18, d18, d19, #6 + vext.8 d19, d19, d20, #6 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, d3 + subs \h, \h, #4 + vqrshrn.u16 d4, q2, #4 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + vst1.16 {d4[2]}, [\dst, :16], \d_strd + vst1.16 {d4[3]}, [\ds2, :16], \d_strd + ble 0f + vmov d16, d20 + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.32 {d16[]}, [\src], \s_strd +4: + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.8 {d16}, [\src], \s_strd +8: + vld1.8 {d17}, [\sr2], \s_strd + vld1.8 {d18}, [\src], \s_strd + vmull.u8 q2, d16, d2 + vmull.u8 q3, d17, d2 + vmlal.u8 q2, d17, d3 + vmlal.u8 q3, d18, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vqrshrn.u16 d6, q3, #4 + vst1.8 {d4}, [\dst, :64], \d_strd + vst1.8 {d6}, [\ds2, :64], \d_strd +.else + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd +.endif + ble 0f + vmov d16, d18 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.8 {q8}, [\src], \s_strd +2: + vld1.8 {q9}, [\sr2], \s_strd + vld1.8 {q10}, [\src], \s_strd + vmull.u8 q12, d16, d2 + vmull.u8 q13, d17, d2 + vmull.u8 q14, d18, d2 + vmull.u8 q15, d19, d2 + vmlal.u8 q12, d18, d3 + vmlal.u8 q13, d19, d3 + vmlal.u8 q14, d20, d3 + vmlal.u8 q15, d21, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d24, q12, #4 + vqrshrn.u16 d25, q13, #4 + vqrshrn.u16 d28, q14, #4 + vqrshrn.u16 d29, q15, #4 + vst1.8 {q12}, [\dst, :128], \d_strd + vst1.8 {q14}, [\ds2, :128], \d_strd +.else + vst1.16 {q12, q13}, [\dst, :128], \d_strd + vst1.16 {q14, q15}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 +.ifc \type, put + add \dst, \dst, #16 +.else + add \dst, \dst, #32 +.endif + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + vmovl.u8 q2, d2 + vmovl.u8 q3, d3 + adr r9, L(\type\()_bilin_hv_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.32 {d28[]}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vmull.u8 q8, d28, d0 + vmlal.u8 q8, d29, d1 + +2: + vld1.32 {d28[]}, [\sr2], \s_strd + vld1.32 {d30[]}, [\src], 
\s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + vtrn.16 d28, d30 + vtrn.16 d29, d31 + vmull.u8 q9, d28, d0 + vmlal.u8 q9, d29, d1 + + vtrn.32 d16, d18 + + vmul.u16 d20, d16, d4 + vmla.u16 d20, d19, d6 + vqrshrn.u16 d20, q10, #8 + subs \h, \h, #2 + vst1.16 {d20[0]}, [\dst, :16], \d_strd + vst1.16 {d20[1]}, [\ds2, :16], \d_strd + ble 0f + vtrn.32 d19, d16 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.8 {d28}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vmull.u8 q8, d28, d0 + vmlal.u8 q8, d29, d1 + +4: + vld1.8 {d28}, [\sr2], \s_strd + vld1.8 {d30}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + vtrn.32 d28, d30 + vtrn.32 d29, d31 + vmull.u8 q9, d28, d0 + vmlal.u8 q9, d29, d1 + + vmov d17, d18 + + vmul.u16 q10, q8, q2 + vmla.u16 q10, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d20, q10, #8 + vst1.32 {d20[0]}, [\dst, :32], \d_strd + vst1.32 {d20[1]}, [\ds2, :32], \d_strd +.else + vrshr.u16 q10, q10, #4 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... 
hv
+160:
+320:
+640:
+1280:
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {q12}, [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vmull.u8        q8,  d24, d0
+        vmlal.u8        q8,  d26, d1
+
+2:
+        vld1.8          {q12}, [\sr2], \s_strd
+        vld1.8          {q14}, [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vext.8          q15, q14, q14, #1
+        vmull.u8        q9,  d24, d0
+        vmlal.u8        q9,  d26, d1
+        vmull.u8        q10, d28, d0
+        vmlal.u8        q10, d30, d1
+
+        vmul.u16        q8,  q8,  q2
+        vmla.u16        q8,  q9,  q3
+        vmul.u16        q9,  q9,  q2
+        vmla.u16        q9,  q10, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d16, q8,  #8
+        vqrshrn.u16     d18, q9,  #8
+        vst1.8          {d16}, [\dst, :64], \d_strd
+        vst1.8          {d18}, [\ds2, :64], \d_strd
+.else
+        vrshr.u16       q8,  q8,  #4
+        vrshr.u16       q9,  q9,  #4
+        vst1.16         {q8}, [\dst, :128], \d_strd
+        vst1.16         {q9}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #8
+.ifc \type, put
+        add             \dst, \dst, #8
+.else
+        add             \dst, \dst, #16
+.endif
+        b               1b
+0:
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
+
+.macro load_filter_ptr src                  // r12 = r11 + (src >> 10) * 8
+        asr             r12, \src, #10
+        add             r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc       // load 8 bytes from [r12] into dst, then src += inc
+        vld1.8          {\dst}, [r12, :64]
+        add             \src, \src, \inc
+.endm
+
+.macro load_filter_row dst, src, inc        // load_filter_ptr followed by load_filter_coef
+        load_filter_ptr \src
+        load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon              // in: r2 (advanced by r3), r5, r7, r8, r11; out: q5; scratch: r12, q0-q4, q6-q7
+        load_filter_ptr r5                  // filter 0
+        vld1.16         {q7}, [r2], r3
+
+        load_filter_coef d0, r5,  r7        // filter 0
+        vmovl.u8        q6,  d14            // original pixels
+        load_filter_row d2,  r5,  r7        // filter 1
+        vmovl.u8        q7,  d15            // original pixels
+        load_filter_row d4,  r5,  r7        // filter 2
+        vmovl.s8        q0,  d0             // filter 0
+        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
+        load_filter_ptr r5                  // filter 3
+        vmovl.s8        q1,  d2             // filter 1
+        vmul.i16        q5,  q6,  q0        // filter 0 output
+        load_filter_coef d0, r5,  r7        // filter 3
+        vmovl.s8        q2,  d4             // filter 2
+        load_filter_ptr r5                  // filter 4
+        vext.8          q4,  q6,  q7,  #2*2 // filter 2 pixels
+        vmul.i16        q3,  q3,  q1        // filter 1 output
+        load_filter_coef d2, r5,  r7        // filter 4
+        vmul.i16        q4,  q4,  q2        // filter 2 output
+        vext.8          q2,  q6,  q7,  #2*3 // filter 3 pixels
+        vmovl.s8        q0,  d0             // filter 3
+        vpaddl.s16      q5,  q5             // pixel 0 (4x32)
+        vpaddl.s16      q3,  q3             // pixel 1 (4x32)
+        vmul.i16        q0,  q2,  q0        // filter 3 output
+        load_filter_ptr r5                  // filter 5
+        vext.8          q2,  q6,  q7,  #2*4 // filter 4 pixels
+        vmovl.s8        q1,  d2             // filter 4
+        vpaddl.s16      q4,  q4             // pixel 2 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0 (2x32)
+        vpadd.s32       d11, d6,  d7        // pixel 1 (2x32)
+        load_filter_coef d6, r5,  r7        // filter 5
+        vmul.i16        q1,  q2,  q1        // filter 4 output
+        vpadd.s32       d8,  d8,  d9        // pixel 2 (2x32)
+        load_filter_ptr r5                  // filter 6
+        vpaddl.s16      q0,  q0             // pixel 3 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0,1
+        vext.8          q2,  q6,  q7,  #2*5 // filter 5 pixels
+        vmovl.s8        q3,  d6             // filter 5
+        vpaddl.s16      q1,  q1             // pixel 4 (4x32)
+        vpadd.s32       d9,  d0,  d1        // pixel 3 (2x32)
+        load_filter_coef d0, r5,  r7        // filter 6
+        vmul.i16        q2,  q2,  q3        // filter 5 output
+        vpadd.s32       d11, d8,  d9        // pixel 2,3
+        load_filter_ptr r5                  // filter 7
+        vpaddl.s16      q2,  q2             // pixel 5 (4x32)
+        vpadd.s32       d8,  d2,  d3        // pixel 4 (2x32)
+        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
+        vmovl.s8        q0,  d0             // filter 6
+        vpadd.s32       d9,  d4,  d5        // pixel 5 (2x32)
+        load_filter_coef d4, r5,  r7        // filter 7
+        vpadd.s32       d8,  d8,  d9        // pixel 4,5
+        vext.8          q1,  q6,  q7,  #2*7 // filter 7 pixels
+        vmovl.s8        q2,  d4             // filter 7
+        vmul.i16        q3,  q3,  q0        // filter 6 output
+        vmul.i16        q1,  q1,  q2        // filter 7 output
+        sub             r5,  r5,  r7,  lsl #3 // undo the eight "r5 += r7" steps done by the coefficient loads above
+        vpaddl.s16      q3,  q3             // pixel 6 (4x32)
+        vpaddl.s16      q1,  q1             // pixel 7 (4x32)
+        vpadd.s32       d6,  d6,  d7        // pixel 6 (2x32)
+        vpadd.s32       d2,  d2,  d3        // pixel 7 (2x32)
+        vpadd.s32       d9,  d6,  d2        // pixel 6,7
+
+        add             r5,  r5,  r8        // r5 += r8, carried over to the next call
+
+        vrshrn.s32      d10, q5,  #3        // q5 (d10/d11) holds the eight narrowed results
+        vrshrn.s32      d11, q4,  #3
+
+        bx              lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100] // r4 = abcd, r5 = mx (stack args sit above 36 + 64 pushed bytes)
+        ldr             r6,  [sp, #108]      // r6 = my
+        ldrd            r8,  r9,  [r4]       // four int16 values of abcd packed in two words
+        sxth            r7,  r8              // low half of the first word, sign-extended
+        asr             r8,  r8,  #16        // high half of the first word
+        asr             r4,  r9,  #16        // high half of the second word
+        sxth            r9,  r9              // low half of the second word
+        mov             r10, #8              // output row counter for the loop at 1: below
+        sub             r2,  r2,  r3,  lsl #1
+        sub             r2,  r2,  r3         // src -= 3 * src_stride in total
+        sub             r2,  r2,  #3         // src -= 3
+        movrel          r11, X(mc_warp_filter), 64*8
+.ifnb \t
+        lsl             r1,  r1,  #1         // "t" variant: dst_stride is doubled (it stores 16-bit elements below)
+.endif
+        add             r5,  r5,  #512
+        add             r6,  r6,  #512
+
+        bl              warp_filter_horz_neon
+        vmov            q8,  q5              // q8-q14 collect the first seven filtered rows
+        bl              warp_filter_horz_neon
+        vmov            q9,  q5
+        bl              warp_filter_horz_neon
+        vmov            q10, q5
+        bl              warp_filter_horz_neon
+        vmov            q11, q5
+        bl              warp_filter_horz_neon
+        vmov            q12, q5
+        bl              warp_filter_horz_neon
+        vmov            q13, q5
+        bl              warp_filter_horz_neon
+        vmov            q14, q5
+
+1:
+        bl              warp_filter_horz_neon
+        vmov            q15, q5              // eighth row of the current 8-row window
+
+        load_filter_row d8,  r6,  r9
+        load_filter_row d9,  r6,  r9
+        load_filter_row d10, r6,  r9
+        load_filter_row d11, r6,  r9
+        load_filter_row d12, r6,  r9
+        load_filter_row d13, r6,  r9
+        load_filter_row d14, r6,  r9
+        load_filter_row d15, r6,  r9
+        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
+        vmovl.s8        q1,  d8
+        vmovl.s8        q2,  d9
+        vmovl.s8        q3,  d10
+        vmovl.s8        q4,  d11
+        vmovl.s8        q5,  d12
+        vmovl.s8        q6,  d13
+
+        sub             r6,  r6,  r9,  lsl #3 // undo the eight "r6 += r9" steps of the loads above
+
+        // This ordering of vmull/vmlal is highly beneficial for
+        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+        vmull.s16       q0,  d16, d2
+        vmlal.s16       q0,  d18, d4
+        vmlal.s16       q0,  d20, d6
+        vmlal.s16       q0,  d22, d8
+        vmlal.s16       q0,  d24, d10
+        vmlal.s16       q0,  d26, d12
+        vmull.s16       q1,  d17, d3
+        vmlal.s16       q1,  d19, d5
+        vmlal.s16       q1,  d21, d7
+        vmlal.s16       q1,  d23, d9
+        vmlal.s16       q1,  d25, d11
+        vmlal.s16       q1,  d27, d13
+
+        vmovl.s8        q2,  d14
+        vmovl.s8        q3,  d15
+
+        vmlal.s16       q0,  d28, d4
+        vmlal.s16       q0,  d30, d6
+        vmlal.s16       q1,  d29, d5
+        vmlal.s16       q1,  d31, d7
+
+        vmov            q8,  q9              // slide the row window down by one (interleaved with the narrowing)
+        vmov            q9,  q10
+        vqrshrn.s32     d0,  q0,  #\shift
+        vmov            q10, q11
+        vqrshrn.s32     d1,  q1,  #\shift
+        vmov            q11, q12
+        vmov            q12, q13
+.ifb \t
+        vqmovun.s16     d0,  q0              // non-"t" variant: saturate to unsigned 8 bit
+.endif
+        vmov            q13, q14
+        vmov            q14, q15
+        subs            r10, r10, #1
+.ifnb \t
+        vst1.16         {q0}, [r0, :128], r1
+.else
+        vst1.8          {d0}, [r0, :64], r1
+.endif
+
+        add             r6,  r6,  r4         // r6 += r4 once per output row
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+warp  , 11
+warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]  // r4 = x, r5 = y
+        ldrd            r6,  r7,  [sp, #44]  // r6 = dst, r7 = dst_stride
+        ldrd            r8,  r9,  [sp, #52]  // r8 = ref, r9 = ref_stride
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             r12, r3,  #1           // ih - 1
+        cmp             r5,  r3
+        sub             lr,  r2,  #1           // iw - 1
+        it              lt
+        movlt           r12, r5                // min(y, ih - 1)
+        cmp             r4,  r2
+        bic             r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+        it              lt
+        movlt           lr,  r4                // min(x, iw - 1)
+        bic             lr,  lr,  lr,  asr #31 // max(min(x, iw - 1), 0)
+        mla             r8,  r12, r9,  r8      // ref += iclip() * stride
+        add             r8,  r8,  lr           // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             r10, r5,  r1           // y + bh
+        neg             r5,  r5                // -y
+        sub             r10, r10, r3           // y + bh - ih
+        sub             r12, r1,  #1           // bh - 1
+        cmp             r10, r1
+        bic             r5,  r5,  r5,  asr #31 // max(-y, 0)
+        it              ge
+        movge           r10, r12               // min(y + bh - ih, bh-1)
+        cmp             r5,  r1
+        bic             r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+        it              ge
+        movge           r5,  r12               // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             r11, r4,  r0           // x + bw
+        neg             r4,  r4                // -x
+        sub             r11, r11, r2           // x + bw - iw
+        sub             lr,  r0,  #1           // bw - 1
+        cmp             r11, r0
+        bic             r4,  r4,  r4,  asr #31 // max(-x, 0)
+        it              ge
+        movge           r11, lr                // min(x + bw - iw, bw-1)
+        cmp             r4,  r0
+        bic             r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+        it              ge
+        movge           r4,  lr                // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             r1,  r1,  r5           // bh - top_ext
+        mla             r6,  r5,  r7,  r6
+        sub             r2,  r0,  r4           // bw - left_ext
+        sub             r1,  r1,  r10          // center_h = bh - top_ext - bottom_ext
+        sub             r2,  r2,  r11          // center_w = bw - left_ext - right_ext
+
+        mov             r0,  r6                // backup of dst
+
+.macro v_loop need_left, need_right            // one center row per pass: optional left fill, copy, optional right fill
+0:
+.if \need_left
+        vld1.8          {d0[]}, [r8]           // replicate the first source byte of the row
+        mov             r12, r6                // out = dst
+        mov             r3,  r4
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12, :128]!
+        bgt             1b
+.endif
+        mov             lr,  r8
+        add             r12, r6,  r4           // out = dst + left_ext
+        mov             r3,  r2
+1:
+        vld1.8          {q0, q1}, [lr]!
+        subs            r3,  r3,  #32
+.if \need_left
+        vst1.8          {q0, q1}, [r12]!
+.else
+        vst1.8          {q0, q1}, [r12, :128]!
+.endif
+        bgt             1b
+.if \need_right
+        add             r3,  r8,  r2           // in + center_w
+        sub             r3,  r3,  #1           // in + center_w - 1
+        add             r12, r6,  r4           // dst + left_ext
+        vld1.8          {d0[]}, [r3]
+        add             r12, r12, r2           // out = dst + left_ext + center_w
+        mov             r3,  r11
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12]!
+        bgt             1b
+.endif
+
+        subs            r1,  r1,  #1           // center_h--
+        add             r6,  r6,  r7
+        add             r8,  r8,  r9
+        bgt             0b
+.endm
+
+        cmp             r4,  #0
+        beq             2f
+        // need_left
+        cmp             r11, #0
+        beq             3f
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cmp             r11, #0
+        beq             4f
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+        cmp             r10, #0
+        // Storing the original dst in r0 overwrote bw, recalculate it here
+        add             r2,  r2,  r4           // center_w + left_ext
+        add             r2,  r2,  r11          // bw = center_w + left_ext + right_ext
+
+        beq             3f                     // flags still come from "cmp r10, #0" (the adds above do not set them)
+        // need_bottom
+        sub             r8,  r6,  r7           // ref = dst - stride
+        mov             r4,  r2
+1:
+        vld1.8          {q0, q1}, [r8, :128]!
+        mov             r3,  r10
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7
+        bgt             2b
+        mls             r6,  r7,  r10, r6      // dst -= bottom_ext * stride
+        subs            r4,  r4,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        cmp             r5,  #0
+        beq             3f
+        // need_top
+        mls             r6,  r7,  r5,  r0      // dst = stored_dst - top_ext * stride
+1:
+        vld1.8          {q0, q1}, [r0, :128]!
+        mov             r3,  r5
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7
+        bgt             2b
+        mls             r6,  r7,  r5,  r6      // dst -= top_ext * stride
+        subs            r2,  r2,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        pop             {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S
new file mode 100644
index 0000000000..ca5e9991a1
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc16.S
@@ -0,0 +1,2734 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d00, d01, d1, d10, d11          // results in d0/d1; the d00..d11 operands are not referenced by this variant
+        vld1.16         {q0, q1}, [r2, :128]!  // 16 halfwords from the first input, pointer advanced
+        vld1.16         {q2, q3}, [r3, :128]!  // 16 halfwords from the second input, pointer advanced
+        vqadd.s16       q0,  q0,  q2
+        vqadd.s16       q1,  q1,  q3
+        vmax.s16        q0,  q0,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vmax.s16        q1,  q1,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vqsub.s16       q0,  q0,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vqsub.s16       q1,  q1,  q12 // -2*PREP_BIAS - 1 << intermediate_bits
+        vshl.s16        \d0, q0,  q13 // -(intermediate_bits+1)
+        vshl.s16        \d1, q1,  q13 // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d00, d01, d1, d10, d11        // per lane: b + (((b - a) * q4) >> 4), a from [r2], b from [r3], then shift/bias/clamp
+        vld1.16         {q0, q1}, [r2, :128]!
+        vld1.16         {q2, q3}, [r3, :128]!
+        // This difference requires a 17 bit range, and all bits are
+        // significant for the following multiplication.
+        vsubl.s16       \d0, d4,  d0
+        vsubl.s16       q0,  d5,  d1
+        vsubl.s16       \d1, d6,  d2
+        vsubl.s16       q1,  d7,  d3
+        vmul.s32        \d0, \d0, q4
+        vmul.s32        q0,  q0,  q4
+        vmul.s32        \d1, \d1, q4
+        vmul.s32        q1,  q1,  q4
+        vshr.s32        \d0, \d0, #4
+        vshr.s32        q0,  q0,  #4
+        vshr.s32        \d1, \d1, #4
+        vshr.s32        q1,  q1,  #4
+        vaddw.s16       \d0, \d0, d4
+        vaddw.s16       q0,  q0,  d5
+        vaddw.s16       \d1, \d1, d6
+        vaddw.s16       q1,  q1,  d7
+        vmovn.i32       \d00, \d0               // narrow the four 32-bit accumulators back into d0/d1
+        vmovn.i32       \d01, q0
+        vmovn.i32       \d10, \d1
+        vmovn.i32       \d11, q1
+        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
+        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
+        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+        vmin.s16        \d0, \d0, q15 // bitdepth_max
+        vmin.s16        \d1, \d1, q15 // bitdepth_max
+        vmax.s16        \d0, \d0, q14 // 0
+        vmax.s16        \d1, \d1, q14 // 0
+.endm
+
+.macro mask d0, d00, d01, d1, d10, d11         // like w_avg, but with a per-lane multiplier read from [r6] and a shift of 6
+        vld1.8          {q7}, [r6, :128]!      // 16 bytes of per-lane weights, pointer advanced
+        vld1.16         {q0, q1}, [r2, :128]!
+        vneg.s8         q7,  q7                // the multiplier used below is the negated byte
+        vld1.16         {q2, q3}, [r3, :128]!
+        vmovl.s8        q6,  d14               // widen the 16 weights to 32 bit: they end up in q4-q7
+        vmovl.s8        q7,  d15
+        vmovl.s16       q4,  d12
+        vmovl.s16       q5,  d13
+        vmovl.s16       q6,  d14
+        vmovl.s16       q7,  d15
+        vsubl.s16       \d0, d4,  d0
+        vsubl.s16       q0,  d5,  d1
+        vsubl.s16       \d1, d6,  d2
+        vsubl.s16       q1,  d7,  d3
+        vmul.s32        \d0, \d0, q4
+        vmul.s32        q0,  q0,  q5
+        vmul.s32        \d1, \d1, q6
+        vmul.s32        q1,  q1,  q7
+        vshr.s32        \d0, \d0, #6
+        vshr.s32        q0,  q0,  #6
+        vshr.s32        \d1, \d1, #6
+        vshr.s32        q1,  q1,  #6
+        vaddw.s16       \d0, \d0, d4
+        vaddw.s16       q0,  q0,  d5
+        vaddw.s16       \d1, \d1, d6
+        vaddw.s16       q1,  q1,  d7
+        vmovn.i32       \d00, \d0
+        vmovn.i32       \d01, q0
+        vmovn.i32       \d10, \d1
+        vmovn.i32       \d11, q1
+        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
+        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
+        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+        vmin.s16        \d0, \d0, q15 // bitdepth_max
+        vmin.s16        \d1, \d1, q15 // bitdepth_max
+        vmax.s16        \d0, \d0, q14 // 0
+        vmax.s16        \d1, \d1, q14 // 0
+.endm
+
+.macro bidir_fn type, bdmax                    // emits <type>_16bpc_neon; bdmax names the register holding bitdepth_max
+function \type\()_16bpc_neon, export=1
+        push            {r4-r7,lr}
+        ldr             r4,  [sp, #20]         // 5th argument (first stack slot above the 20 bytes just pushed)
+        ldr             r5,  [sp, #24]         // 6th argument; counted down by the store loops below
+        ldr             r6,  [sp, #28]         // 7th argument (bdmax for avg, broadcast to q4 for w_avg, read pointer for mask)
+        clz             r4,  r4
+.ifnc \type, avg
+        ldr             r7,  [sp, #32]
+        vmov.i16        q14, #0
+        vdup.16         q15, r7         // bitdepth_max
+.endif
+.ifc \type, w_avg
+        vpush           {q4}
+.endif
+.ifc \type, mask
+        vpush           {q4-q7}
+.endif
+        clz             r7,  \bdmax
+        sub             r7,  r7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+        mov             lr,  #1
+        movw            r12, #2*PREP_BIAS
+        lsl             lr,  lr,  r7    // 1 << intermediate_bits
+        neg             r12, r12         // -2*PREP_BIAS
+        add             r7,  r7,  #1
+        sub             r12, r12, lr    // -2*PREP_BIAS - 1 << intermediate_bits
+        neg             r7,  r7          // -(intermediate_bits+1)
+        vdup.16         q12, r12         // -2*PREP_BIAS - 1 << intermediate_bits
+        vdup.16         q13, r7          // -(intermediate_bits+1)
+.else
+        mov             r12, #PREP_BIAS
+        lsr             r12, r12, r7    // PREP_BIAS >> intermediate_bits
+        neg             r7,  r7          // -intermediate_bits
+        vdup.16         q12, r12         // PREP_BIAS >> intermediate_bits
+        vdup.16         q13, r7          // -intermediate_bits
+.endif
+.ifc \type, w_avg
+        vdup.32         q4,  r6
+        vneg.s32        q4,  q4                // q4 = -r6 in every lane, the multiplier used by the w_avg macro
+.endif
+        adr             r7,  L(\type\()_tbl)
+        sub             r4,  r4,  #24          // table index = clz(5th argument) - 24
+        \type           q8,  d16, d17, q9,  d18, d19 // first batch is computed before dispatching
+        ldr             r4,  [r7, r4, lsl #2]
+        add             r7,  r7,  r4
+        bx              r7
+
+        .align 2
+L(\type\()_tbl):
+        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_tbl) + CONFIG_THUMB
+
+40:
+        add             r7,  r0,  r1           // r7 = second row pointer; r1 is doubled below
+        lsl             r1,  r1,  #1
+4:
+        subs            r5,  r5,  #4           // four rows of 8 bytes per batch
+        vst1.16         {d16},  [r0, :64], r1
+        vst1.16         {d17},  [r7, :64], r1
+        vst1.16         {d18},  [r0, :64], r1
+        vst1.16         {d19},  [r7, :64], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               4b
+80:
+        add             r7,  r0,  r1
+        lsl             r1,  r1,  #1
+8:
+        vst1.16         {q8},  [r0, :128], r1
+        subs            r5,  r5,  #2           // two rows of 16 bytes per batch
+        vst1.16         {q9},  [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               8b
+160:
+16:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #2           // two rows of 32 bytes per iteration
+        vst1.16         {q10, q11}, [r0, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               16b
+320:
+        add             r7,  r0,  #32          // r7 addresses the second 32-byte half of each row
+32:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               32b
+640:
+        add             r7,  r0,  #32
+        mov             r12, #64
+        sub             r1,  r1,  #64          // compensates for the one 64-byte step taken via r12 per row
+64:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               64b
+1280:
+        add             r7,  r0,  #32
+        mov             r12, #64
+        sub             r1,  r1,  #192         // compensates for the three 64-byte steps taken via r12 per row
+128:
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r12
+        \type           q8,  d16, d17, q9,  d18, d19
+        vst1.16         {q10, q11}, [r7, :128], r12
+        \type           q10, d20, d21, q11, d22, d23
+        vst1.16         {q8,  q9},  [r0, :128], r1
+        subs            r5,  r5,  #1
+        vst1.16         {q10, q11}, [r7, :128], r1
+        ble             0f
+        \type           q8,  d16, d17, q9,  d18, d19
+        b               128b
+0:
+.ifc \type, mask
+        vpop            {q4-q7}
+.endif
+.ifc \type, w_avg
+        vpop            {q4}
+.endif
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
+function put_neon                              // plain copy; dispatches through L(put_tbl) with r9 as the index
+        adr             r10, L(put_tbl)
+        ldr             r9,  [r10, r9, lsl #2]
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(put_tbl):
+        .word 1280f - L(put_tbl) + CONFIG_THUMB
+        .word 640f  - L(put_tbl) + CONFIG_THUMB
+        .word 320f  - L(put_tbl) + CONFIG_THUMB
+        .word 16f   - L(put_tbl) + CONFIG_THUMB
+        .word 80f   - L(put_tbl) + CONFIG_THUMB
+        .word 4f    - L(put_tbl) + CONFIG_THUMB
+        .word 2f    - L(put_tbl) + CONFIG_THUMB
+
+2:
+        vld1.32         {d0[]}, [r2], r3       // 4 bytes per row, replicated to both lanes
+        vld1.32         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2           // two rows per iteration
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+        bgt             2b
+        pop             {r4-r11,pc}            // no push in this function: presumably unwinds the frame of the entry point that branched here - confirm against callers
+4:
+        vld1.16         {d0}, [r2], r3         // 8 bytes per row
+        vld1.16         {d1}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.16         {d0}, [r0, :64], r1
+        vst1.16         {d1}, [r0, :64], r1
+        bgt             4b
+        pop             {r4-r11,pc}
+80:
+        add             r8,  r0,  r1           // r8/r9 = second destination/source row; strides doubled
+        lsl             r1,  r1,  #1
+        add             r9,  r2,  r3
+        lsl             r3,  r3,  #1
+8:
+        vld1.16         {q0}, [r2], r3
+        vld1.16         {q1}, [r9], r3
+        subs            r5,  r5,  #2
+        vst1.16         {q0}, [r0, :128], r1
+        vst1.16         {q1}, [r8, :128], r1
+        bgt             8b
+        pop             {r4-r11,pc}
+16:
+        vld1.16         {q0,  q1},  [r2], r3   // 32 bytes per row, one row per iteration
+        subs            r5,  r5,  #1
+        vst1.16         {q0,  q1},  [r0, :128], r1
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        sub             r1,  r1,  #32          // strides reduced by the 32 bytes advanced via "!" within a row
+        sub             r3,  r3,  #32
+32:
+        vld1.16         {q0,  q1},  [r2]!
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q2,  q3},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q2,  q3},  [r0, :128], r1
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r1,  r1,  #96          // three 32-byte "!" steps per row
+        sub             r3,  r3,  #96
+64:
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q14, q15}, [r0, :128], r1
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r1,  r1,  #224         // seven 32-byte "!" steps per row
+        sub             r3,  r3,  #224
+128:
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2]!
+        vst1.16         {q14, q15}, [r0, :128]!
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q14, q15}, [r0, :128], r1
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
+function prep_neon
+        adr             r10, L(prep_tbl)
+        ldr             r9,  [r10, r9, lsl #2]
+        vdup.16         q15, r7   // intermediate_bits
+        vmov.i16        q14, #PREP_BIAS
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(prep_tbl):
+        .word 1280f - L(prep_tbl) + CONFIG_THUMB
+        .word 640f  - L(prep_tbl) + CONFIG_THUMB
+        .word 320f  - L(prep_tbl) + CONFIG_THUMB
+        .word 16f   - L(prep_tbl) + CONFIG_THUMB
+        .word 80f   - L(prep_tbl) + CONFIG_THUMB
+        .word 40f   - L(prep_tbl) + CONFIG_THUMB
+
+40:
+        add             r9,  r1,  r2           // r9 = second source row; source stride doubled
+        lsl             r2,  r2,  #1
+4:
+        vld1.16         {d0}, [r1], r2
+        vld1.16         {d1}, [r9], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q0,  q0,  q15          // << intermediate_bits
+        vsub.i16        q0,  q0,  q14          // - PREP_BIAS
+        vst1.16         {q0}, [r0, :128]!      // output is written contiguously (post-increment, no stride)
+        bgt             4b
+        pop             {r4-r11,pc}            // no push in this function: presumably unwinds the frame of the entry point that branched here - confirm against callers
+80:
+        add             r9,  r1,  r2
+        lsl             r2,  r2,  #1
+8:
+        vld1.16         {q0}, [r1], r2
+        vld1.16         {q1}, [r9], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q0,  q0,  q15
+        vshl.s16        q1,  q1,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vst1.16         {q0, q1}, [r0, :128]!
+        bgt             8b
+        pop             {r4-r11,pc}
+16:
+        vld1.16         {q0, q1}, [r1], r2
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2, q3}, [r1], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vshl.s16        q3,  q3,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vst1.16         {q0, q1}, [r0, :128]!
+        vsub.i16        q3,  q3,  q14
+        vst1.16         {q2, q3}, [r0, :128]!
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        sub             r2,  r2,  #32          // source stride reduced by the bytes advanced via "!" within a row
+32:
+        vld1.16         {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2,  q3},  [r1], r2
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vshl.s16        q3,  q3,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vsub.i16        q3,  q3,  q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r2,  r2,  #96
+64:
+        vld1.16         {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2,  q3},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q2,  q2,  q15
+        vld1.16         {q10, q11}, [r1], r2
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vsub.i16        q9,  q9,  q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vsub.i16        q10, q10, q14
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vsub.i16        q11, q11, q14
+        vst1.16         {q10, q11}, [r0, :128]!
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r2,  r2,  #224
+128:
+        vld1.16         {q0,  q1},  [r1]!      // first 128 bytes of the row
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2,  q3},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q2,  q2,  q15
+        vld1.16         {q10, q11}, [r1]!
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q0,  q1},  [r1]!      // second 128 bytes, loads interleaved with the stores of the first half
+        vsub.i16        q9,  q9,  q14
+        vsub.i16        q10, q10, q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vld1.16         {q2,  q3},  [r1]!
+        vsub.i16        q11, q11, q14
+        vshl.s16        q0,  q0,  q15
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q10, q11}, [r1], r2
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vsub.i16        q9,  q9,  q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vsub.i16        q10, q10, q14
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vsub.i16        q11, q11, q14
+        vst1.16         {q10, q11}, [r0, :128]!
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // replicating loads alternating between s0 and s1; trailing operands may be blank
+        vld1.\wd        {\d0[]}, [\s0], \strd
+        vld1.\wd        {\d1[]}, [\s1], \strd
+.ifnb \d2
+        vld1.\wd        {\d2[]}, [\s0], \strd
+        vld1.\wd        {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.\wd        {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.\wd        {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.\wd        {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // whole-register loads alternating between s0 and s1
+        vld1.16         {\d0}, [\s0], \strd
+        vld1.16         {\d1}, [\s1], \strd
+.ifnb \d2
+        vld1.16         {\d2}, [\s0], \strd
+        vld1.16         {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.16         {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.16         {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.16         {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 // register-pair loads alternating between s0 and s1
+        vld1.16         {\d0, \d1}, [\s0], \strd
+.ifnb \d2
+        vld1.16         {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.16         {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with a 32-bit element width
+        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 // alias for load_regpair
+        load_regpair    \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4      // each register becomes its own upper 4 bytes followed by the next one's lower 4
+        vext.8          \r0, \r0, \r1, #4
+        vext.8          \r1, \r1, \r2, #4
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #4
+        vext.8          \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3              // clamp 1, 2 or 4 registers from above against c
+        vmin.u16        \r0, \r0, \c
+.ifnb \r1
+        vmin.u16        \r1, \r1, \c
+.endif
+.ifnb \r2
+        vmin.u16        \r2, \r2, \c
+        vmin.u16        \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3              // subtract c from 1, 2 or 4 registers
+        vsub.i16        \r0, \r0, \c
+.ifnb \r1
+        vsub.i16        \r1, \r1, \c
+.endif
+.ifnb \r2
+        vsub.i16        \r2, \r2, \c
+        vsub.i16        \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3         // d = sum of s[i] * coefficient i, coefficients taken from the lanes of d0
+        vmull.s16       \d,  \s0, d0[0]
+        vmlal.s16       \d,  \s1, d0[1]
+        vmlal.s16       \d,  \s2, d0[2]
+        vmlal.s16       \d,  \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // as above with eight coefficients from d0 and d1
+        vmull.s16       \d,  \s0, d0[0]
+        vmlal.s16       \d,  \s1, d0[1]
+        vmlal.s16       \d,  \s2, d0[2]
+        vmlal.s16       \d,  \s3, d0[3]
+        vmlal.s16       \d,  \s4, d1[0]
+        vmlal.s16       \d,  \s5, d1[1]
+        vmlal.s16       \d,  \s6, d1[2]
+        vmlal.s16       \d,  \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 // rounding, saturating, unsigned narrowing shift of 1, 2 or 4 registers
+        vqrshrun.s32    \d0, \q0, #\shift
+.ifnb \q1
+        vqrshrun.s32    \d1, \q1, #\shift
+.endif
+.ifnb \q2
+        vqrshrun.s32    \d2, \q2, #\shift
+        vqrshrun.s32    \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 // plain 32 to 16 bit narrowing of 1, 2 or 4 registers
+        vmovn.i32       \d0, \q0
+.ifnb \q1
+        vmovn.i32       \d1, \q1
+.endif
+.ifnb \q2
+        vmovn.i32       \d2, \q2
+        vmovn.i32       \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3         // rounding shift by a register operand; r0 and r1 are mandatory here
+        vrshl.s32       \r0, \r0, \shift
+        vrshl.s32       \r1, \r1, \shift
+.ifnb \r2
+        vrshl.s32       \r2, \r2, \shift
+        vrshl.s32       \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1                    // stores go to the literal registers r0 and r9, not to macro operands
+        vst1.32         {\r0[0]}, [r0, :32], \strd
+        vst1.32         {\r0[1]}, [r9, :32], \strd
+.ifnb \r1
+        vst1.32         {\r1[0]}, [r0, :32], \strd
+        vst1.32         {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 // 2, 4 or 8 stores alternating between the literal registers r0 and r9
+        vst1.16         {\r0}, [r0, \align], \strd
+        vst1.16         {\r1}, [r9, \align], \strd
+.ifnb \r2
+        vst1.16         {\r2}, [r0, \align], \strd
+        vst1.16         {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4
+        vst1.16         {\r4}, [r0, \align], \strd
+        vst1.16         {\r5}, [r9, \align], \strd
+        vst1.16         {\r6}, [r0, \align], \strd
+        vst1.16         {\r7}, [r9, \align], \strd
+.endif
+.endm
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 // put: narrow by 6 and clamp against q15; otherwise: shift by q14, narrow, subtract q15
+.ifc \type, put
+        vqrshrun_s32    6,   \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        vmin_u16        q15, \q0, \q1
+.else
+        vrshl_s32       q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
+        vmovn_i32       \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        vsub_i16        q15, \q0, \q1           // PREP_BIAS
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 // finalize, then four 8-byte stores
+        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+        vst1_reg        \strd, :64, \d0, \d1, \d2, \d3
+.endm
+.macro shift_store_8 type, strd, q0, q1,
d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :128, \q0, \q1 +.endm +.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1.16 {\q0, \q1}, [r0, :128], \strd +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + movw r9, \type_h + movw r10, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r11 + mul \my, \my, r11 + add \mx, \mx, r9 // mx, 8tap_h, 4tap_h + add \my, \my, r10 // my, 8tap_v, 4tap_v + +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + vdup.16 q15, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + tst \mx, #(0x7f << 14) + sub r9, r9, #24 + add lr, \bdmax, #6 // 6 + intermediate_bits + rsb r12, \bdmax, #6 // 6 - intermediate_bits + movrel r11, X(mc_subpel_filters), -8 + bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r10, \mx, #7, #7 + 
and \mx, \mx, #0x7f + it gt + movgt \mx, r10 + tst \my, #(0x7f << 14) + add \mx, r11, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r10, L(\type\()_8tap_h_tbl) + vdup.32 q14, r12 // 6 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s32 q14, q14 // -(6-intermediate_bits) +.ifc \type, put + vdup.16 q13, \bdmax // intermediate_bits +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q13, q13 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.16 {q2}, [\src], \s_strd + vld1.16 {q3}, [\sr2], \s_strd + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmull.s16 q1, d4, d0[0] + vmlal.s16 q1, d5, d0[1] + vmlal.s16 q1, d6, d0[2] + vmlal.s16 q1, d7, d0[3] + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vqmovun.s32 d2, q1 + vrshl.s16 d2, d2, d26 // -intermediate_bits + vmin.u16 d2, d2, d30 + vst1.32 {d2[0]}, [\dst, :32], \d_strd + vst1.32 {d2[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q11}, [\sr2], \s_strd + vext.8 d18, d16, d17, #2 + vext.8 d19, d16, d17, #4 + vext.8 d20, d16, d17, #6 + vext.8 d24, 
d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d21, d22, d23, #6 + subs \h, \h, #2 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmlal.s16 q2, d20, d0[3] + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d4, q2 + vmovn.s32 d5, q3 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + vpush {q4-q5} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +81: + vld1.16 {q8, q9}, [\src]! + vld1.16 {q10, q11}, [\sr2]! 
+ mov \mx, \w + +8: + vmull.s16 q1, d16, d0[0] + vmull.s16 q2, d17, d0[0] + vmull.s16 q3, d20, d0[0] + vmull.s16 q4, d21, d0[0] +.irpc i, 1234567 + vext.8 q12, q8, q9, #(2*\i) + vext.8 q5, q10, q11, #(2*\i) +.if \i < 4 + vmlal.s16 q1, d24, d0[\i] + vmlal.s16 q2, d25, d0[\i] + vmlal.s16 q3, d10, d0[\i] + vmlal.s16 q4, d11, d0[\i] +.else + vmlal.s16 q1, d24, d1[\i-4] + vmlal.s16 q2, d25, d1[\i-4] + vmlal.s16 q3, d10, d1[\i-4] + vmlal.s16 q4, d11, d1[\i-4] +.endif +.endr + subs \mx, \mx, #8 + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d2, q1 + vqmovun.s32 d3, q2 + vqmovun.s32 d4, q3 + vqmovun.s32 d5, q4 + vrshl.s16 q1, q1, q13 // -intermediate_bits + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q1, q1, q15 + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d2, q1 + vmovn.s32 d3, q2 + vmovn.s32 d4, q3 + vmovn.s32 d5, q4 + vsub.i16 q1, q1, q13 // PREP_BIAS + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {q1}, [\dst, :128]! + vst1.16 {q2}, [\ds2, :128]! + ble 9f + + vmov q8, q9 + vmov q10, q11 + vld1.16 {q9}, [\src]! + vld1.16 {q11}, [\sr2]! 
+ b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 81b + vpop {q4-q5} + pop {r4-r11,pc} + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 + add \my, r11, \my, lsl #3 + +.ifc \type, prep + vdup.32 q14, r12 // 6 - intermediate_bits + vmov.i16 q15, #PREP_BIAS +.endif + adr r10, L(\type\()_8tap_v_tbl) + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vneg.s32 q14, q14 // -(6-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + bgt 24f + vmull_vmlal_4 q8, d1, d2, d3, d4 + vqrshrun_s32 6, q8, d16 + vmin_u16 d30, d16 + vst1_32 \d_strd, d16 + pop {r4-r11,pc} + +24: // 2x4 v + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d3, d4, d5, d6 + vqrshrun_s32 6, q8, d16, q9, d17 + vmin_u16 q15, q8 + vst1_32 \d_strd, d16, d17 + pop {r4-r11,pc} + +28: // 2x8, 2x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 + 
interleave_1_32 d2, d3, d4, d5, d6 + interleave_1_32 d6, d7, d16 +216: + subs \h, \h, #8 + load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 + load_32 \sr2, \src, \s_strd, d21, d22, d23, d24 + interleave_1_32 d16, d17, d18, d19, d20 + interleave_1_32 d20, d21, d22, d23, d24 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 + vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21 + vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23 + vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3 + vmin_u16 q15, q13, q1 + vst1_32 \d_strd, d26, d27 + vst1_32 \d_strd, d2, d3 + ble 0f + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov d16, d24 + b 216b +0: + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d2, d3, d4, d5 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmull_vmlal_4 q8, d3, d4, d5, d6 + vmull_vmlal_4 q9, d4, d5, d6, d7 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 + +48: + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 + vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 + shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 + ble 0f + 
vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov d22, d26 + b 48b +0: + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 + vmull_vmlal_4 q10, d2, d4, d6, d16 + vmull_vmlal_4 q11, d3, d5, d7, d17 + vmull_vmlal_4 q12, d4, d6, d16, d18 + vmull_vmlal_4 q13, d5, d7, d17, d19 + shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmull_vmlal_4 q1, d6, d16, d18, d20 + vmull_vmlal_4 q2, d7, d17, d19, d21 + vmull_vmlal_4 q12, d16, d18, d20, d22 + vmull_vmlal_4 q13, d17, d19, d21, d23 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q12, q13 + vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 + vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 + vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 + vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q1, q2 + vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 + vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 + vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 + vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 + shift_store_8 \type, 
\d_strd, q3, q4, d6, d7, q5, q6, d8, d9 + ble 9f + vmov q5, q9 + vmov q6, q10 + vmov q7, q11 + vmov q8, q12 + vmov q9, q13 + vmov q10, q1 + vmov q11, q2 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + vpush {q6-q7} + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + + load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 +16: + load_16s16 \src, \src, \s_strd, q12, q13 + subs \h, \h, #1 + vmull_vmlal_4 q1, d12, d16, d20, d24 + vmull_vmlal_4 q2, d13, d17, d21, d25 + vmull_vmlal_4 q3, d14, d18, d22, d26 + vmull_vmlal_4 q6, d15, d19, d23, d27 + shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5 + ble 0f + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov q11, q13 + b 16b +0: + vpop {q6-q7} + pop {r4-r11,pc} + + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 +4: + add \my, r11, \my, lsl #3 + + adr r10, L(\type\()_8tap_hv_tbl) + neg r12, r12 // -(6-intermediate_bits) + ldr r9, [r10, r9, lsl #2] + vdup.32 q14, r12 // -(6-intermediate_bits) +.ifc \type, put + neg r8, lr // -(6+intermediate_bits) +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vdup.32 q13, r8 // -(6+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + +20: +.ifc
\type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d24, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d24, d2[3] + + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vmin.u16 d4, d4, d30 + subs \h, \h, #2 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d24 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d24, #4 + vmov d19, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d24, #4 + vmov d21, d24 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d24, #4 + vmull.s16 q3, d16, d2[0] + vmlal.s16 q3, d17, d2[1] + vmlal.s16 q3, d18, d2[2] + vmlal.s16 
q3, d19, d2[3] + vmlal.s16 q3, d20, d3[0] + vmlal.s16 q3, d21, d3[1] + vmlal.s16 q3, d22, d3[2] + vmlal.s16 q3, d24, d3[3] + + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d6, q3 + vmin.u16 d6, d6, d30 + subs \h, \h, #2 + vst1.32 {d6[0]}, [\dst, :32], \d_strd + vst1.32 {d6[1]}, [\ds2, :32], \d_strd + ble 0f + vmov q8, q9 + vmov q9, q10 + vmov d20, d22 + vmov d21, d24 + b 28b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.16 {q11}, [\sr2], \s_strd + vld1.16 {q12}, [\src], \s_strd + vext.8 d23, d22, d23, #2 + vext.8 d25, d24, d25, #2 + vtrn.32 q11, q12 + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d23, d0[1] + vmlal.s16 q3, d24, d0[2] + vmlal.s16 q3, d25, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d17, q10 + + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d17, d2[0] + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q2, d19, d2[2] + vmlal.s16 q2, d24, d2[3] + vmull.s16 q3, d18, d2[0] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q3, d24, d2[2] + vmlal.s16 q3, d25, d2[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + + vst1.16 {d4}, [\dst, :64], 
\d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d17, d19 + vmov q9, q12 + b 4b +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16, 4x32 hv + vpush {d13-d15} + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d13, q10 + + bl L(\type\()_8tap_filter_4) + vmov q7, q12 + bl L(\type\()_8tap_filter_4) + vmov q8, q12 + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d13, d2[0] + vmlal.s16 q2, d14, d2[1] + vmlal.s16 q2, d15, d2[2] + vmlal.s16 q2, d16, d2[3] + vmlal.s16 q2, d17, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q2, d19, d3[2] + vmlal.s16 q2, d24, d3[3] + vmull.s16 q3, d14, d2[0] + vmlal.s16 q3, d15, d2[1] + vmlal.s16 q3, d16, d2[2] + vmlal.s16 q3, d17, d2[3] + vmlal.s16 q3, d18, d3[0] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q3, d24, d3[2] + vmlal.s16 q3, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d13, d15 + vmov q7, q8 + vmov q8, q9 + vmov q9, q12 + b 48b +0: + vpop {d13-d15} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d20, d21, #2 + vext.8 d25, d20, d21, #4 + vext.8 d21, d20, d21, #6 + vmull.s16 q3, d20, d0[0] + vmlal.s16 
q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + vmovn.i32 d25, q10 + bx lr + +80: +160: +320: + bgt 880f + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #6 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d16, q2 + vmovn.i32 d17, q3 + + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d16, d2[0] + vmull.s16 q3, d17, d2[0] + vmull.s16 q13, d18, d2[0] + vmull.s16 q14, d19, d2[0] +.ifc \type, put + vdup.32 q8, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q13, d20, d2[1] + vmlal.s16 q14, d21, d2[1] + vmlal.s16 q2, d20, d2[2] + vmlal.s16 q3, d21, d2[2] + vmlal.s16 q13, d22, d2[2] + vmlal.s16 q14, d23, d2[2] + vmlal.s16 q2, d22, d2[3] + vmlal.s16 q3, d23, d2[3] + vmlal.s16 q13, d24, d2[3] + vmlal.s16 q14, d25, d2[3] +.ifc \type, put + vdup.16 q9, \bdmax // bitdepth_max + vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q8 // 
-(6+intermediate_bits) + vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q9, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q9 // PREP_BIAS + vsub.i16 q3, q3, q9 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b +0: + pop {r4-r11,pc} + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d8, q2 + vmovn.i32 d9, q3 + + bl L(\type\()_8tap_filter_8) + vmov q5, q11 + vmov q6, q12 + bl L(\type\()_8tap_filter_8) + vmov q7, q11 + vmov q8, q12 + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d8, d2[0] 
+ vmull.s16 q3, d9, d2[0] + vmull.s16 q13, d10, d2[0] + vmull.s16 q14, d11, d2[0] +.ifc \type, put + vdup.32 q4, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d10, d2[1] + vmlal.s16 q3, d11, d2[1] + vmlal.s16 q13, d12, d2[1] + vmlal.s16 q14, d13, d2[1] + vmlal.s16 q2, d12, d2[2] + vmlal.s16 q3, d13, d2[2] + vmlal.s16 q13, d14, d2[2] + vmlal.s16 q14, d15, d2[2] + vmlal.s16 q2, d14, d2[3] + vmlal.s16 q3, d15, d2[3] + vmlal.s16 q13, d16, d2[3] + vmlal.s16 q14, d17, d2[3] + vmlal.s16 q2, d16, d3[0] + vmlal.s16 q3, d17, d3[0] + vmlal.s16 q13, d18, d3[0] + vmlal.s16 q14, d19, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q13, d20, d3[1] + vmlal.s16 q14, d21, d3[1] + vmlal.s16 q2, d20, d3[2] + vmlal.s16 q3, d21, d3[2] + vmlal.s16 q13, d22, d3[2] + vmlal.s16 q14, d23, d3[2] + vmlal.s16 q2, d22, d3[3] + vmlal.s16 q3, d23, d3[3] + vmlal.s16 q13, d24, d3[3] + vmlal.s16 q14, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q4 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q5, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q5 // PREP_BIAS + vsub.i16 q3, q3, q5 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + 
+L(\type\()_8tap_filter_8): + vld1.16 {q13, q14}, [\sr2], \s_strd + vmull.s16 q2, d26, d0[0] + vmull.s16 q3, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d24, d0[\i] + vmlal.s16 q3, d25, d0[\i] +.else + vmlal.s16 q2, d24, d1[\i - 4] + vmlal.s16 q3, d25, d1[\i - 4] +.endif +.endr + vdup.32 q12, r12 // -(6-intermediate_bits) + vld1.16 {q13, q14}, [\src], \s_strd + vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) + vmovn.i32 d4, q2 + vmovn.i32 d5, q3 + + vmull.s16 q3, d26, d0[0] + vmull.s16 q11, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q3, d24, d0[\i] + vmlal.s16 q11, d25, d0[\i] +.else + vmlal.s16 q3, d24, d1[\i - 4] + vmlal.s16 q11, d25, d1[\i - 4] +.endif +.endr + vdup.32 q13, r12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) + vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) + + vmovn.i32 d24, q3 + vmovn.i32 d25, q11 + vmov q11, q2 + bx lr +endfunc + +function \type\()_bilin_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + vdup.16 q1, \mx + vdup.16 q3, \my + rsb r9, \mx, #16 + rsb r10, \my, #16 + vdup.16 q0, r9 + vdup.16 q2, r10 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz \bdmax, \bdmax // bitdepth_max + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + cmp \mx, #0 + sub r9, r9, #24 + rsb r11, \bdmax, #4 // 4 - intermediate_bits + add r12, \bdmax, #4 // 4 + intermediate_bits + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r10, L(\type\()_bilin_h_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.16 q14, \bdmax // intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS 
+.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q14, q14 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.16 {d16}, [\src], \s_strd + vld1.16 {d18}, [\sr2], \s_strd + vext.8 d17, d16, d16, #2 + vext.8 d19, d18, d18, #2 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + subs \h, \h, #2 + vmul.i16 d16, d16, d0 + vmla.i16 d16, d17, d2 + vrshl.u16 d16, d16, d30 + vrshl.u16 d16, d16, d28 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #2 + vext.8 q11, q10, q10, #2 + vmov d17, d20 + vmov d19, d22 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vrshl.u16 q8, q8, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 +.else + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.16 {d16, d17, d18}, [\src], \s_strd + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vext.8 q9, q8, q9, #2 + vext.8 q11, q10, q11, #2 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q11, q1 + vrshl.u16 q8, q8, q15 + 
vrshl.u16 q10, q10, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 + vrshl.u16 q10, q10, q14 +.else + vsub.i16 q8, q8, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... h + vpush {q4-q7} + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +161: + vld1.16 {q4}, [\src]! + vld1.16 {q9}, [\sr2]! + mov \mx, \w + +16: + vld1.16 {q5, q6}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + vext.8 q7, q4, q5, #2 + vext.8 q8, q5, q6, #2 + vext.8 q12, q9, q10, #2 + vext.8 q13, q10, q11, #2 + vmul.i16 q4, q4, q0 + vmla.i16 q4, q7, q1 + vmul.i16 q5, q5, q0 + vmla.i16 q5, q8, q1 + vmul.i16 q9, q9, q0 + vmla.i16 q9, q12, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q4, q4, q15 + vrshl.u16 q5, q5, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + subs \mx, \mx, #16 +.ifc \type, put + vrshl.u16 q4, q4, q14 + vrshl.u16 q5, q5, q14 + vrshl.u16 q9, q9, q14 + vrshl.u16 q10, q10, q14 +.else + vsub.i16 q4, q4, q14 + vsub.i16 q5, q5, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q4, q5}, [\dst, :128]! + vst1.16 {q9, q10}, [\ds2, :128]! 
+ ble 9f + + vmov q4, q6 + vmov q9, q11 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q7} + pop {r4-r11,pc} + + +L(\type\()_bilin_v): + cmp \h, #4 + adr r10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + vdup.16 q15, r11 // 4 - intermediate_bits +.endif + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vmov.i16 q14, #PREP_BIAS + vneg.s16 q15, q15 // -(4-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.32 {d16[]}, [\src], \s_strd + bgt 24f + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmul.i16 d16, d16, d4 + vmla.i16 d16, d17, d6 + vrshr.u16 d16, d16, #4 + vst1.32 {d16[0]}, [\dst, :32] + vst1.32 {d16[1]}, [\ds2, :32] + pop {r4-r11,pc} +24: // 2x4, 2x8, ... 
v + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vld1.32 {d19[]}, [\sr2], \s_strd + vld1.32 {d20[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vext.8 d18, d18, d19, #4 + vext.8 d19, d19, d20, #4 + vswp d17, d18 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #4 + vrshr.u16 q8, q8, #4 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + vst1.32 {d17[0]}, [\dst, :32], \d_strd + vst1.32 {d17[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d20 + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {d16}, [\src], \s_strd +4: + vld1.16 {d17}, [\sr2], \s_strd + vld1.16 {d19}, [\src], \s_strd + vmov d18, d17 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 +.else + vrshl.u16 q8, q8, q15 + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {q8}, [\src], \s_strd +8: + vld1.16 {q9}, [\sr2], \s_strd + vld1.16 {q10}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q10, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q9}, [\ds2, :128], \d_strd + ble 0f + vmov q8, q10 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q8, q9}, [\src], \s_strd +2: + vld1.16 {q10, q11}, [\sr2], \s_strd + vld1.16 {q12, q13}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q10, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q11, q3 + vmul.i16 q10, q10, q2 + vmla.i16 q10, q12, q3 + vmul.i16 q11, q11, q2 + vmla.i16 q11, q13, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 + vrshr.u16 q10, q10, #4 + vrshr.u16 q11, q11, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + vrshl.u16 q11, q11, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vsub.i16 q11, q11, q14 +.endif + vst1.16 {q8, q9}, [\dst, :128], \d_strd + vst1.16 {q10, q11}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q12 + vmov q9, q13 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + adr r10, L(\type\()_bilin_hv_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.32 q14, r12 // 4 + intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s32 q14, q14 // -(4+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + 
+20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + vext.8 d16, d16, d16, #4 + +2: + vld1.16 {d20}, [\sr2], \s_strd + vld1.16 {d22}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vext.8 d23, d22, d22, #2 + vtrn.32 d20, d22 + vtrn.32 d21, d23 + vmul.i16 d18, d20, d0 + vmla.i16 d18, d21, d2 + vrshl.u16 d18, d18, d30 + + vext.8 d16, d16, d18, #4 + + vmull.u16 q8, d16, d4 + vmlal.u16 q8, d18, d6 + vrshl.u32 q8, q8, q14 + vmovn.i32 d16, q8 + subs \h, \h, #2 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q10}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + +4: + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vext.8 d23, d22, d23, #2 + vswp d21, d22 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vrshl.u16 q9, q9, q15 + + vmull.u16 q10, d16, d4 + vmlal.u16 q10, d18, d6 + vmull.u16 q11, d18, d4 + vmlal.u16 q11, d19, d6 +.ifc \type, put + vrshl.u32 q10, q10, q14 + vrshl.u32 q11, q11, q14 + vmovn.i32 d20, q10 + vmovn.i32 d21, q11 +.else + vrshrn.i32 d20, q10, #4 + vrshrn.i32 d21, q11, #4 + vsub.i16 q10, q10, q14 +.endif + subs \h, \h, #2 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20, d21, d22}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vmul.i16 q8, q10, q0 + vmla.i16 q8, q11, q1 + vrshl.u16 q8, q8, q15 + +2: + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vld1.16 {d24, d25, d26}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vext.8 q13, q12, q13, #2 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vmul.i16 q10, q12, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + + vmull.u16 q11, d16, d4 + vmlal.u16 q11, d18, d6 + vmull.u16 q12, d17, d4 + vmlal.u16 q12, d19, d6 + vmull.u16 q8, d18, d4 + vmlal.u16 q8, d20, d6 + vmull.u16 q9, d19, d4 + vmlal.u16 q9, d21, d6 +.ifc \type, put + vrshl.u32 q11, q11, q14 + vrshl.u32 q12, q12, q14 + vrshl.u32 q8, q8, q14 + vrshl.u32 q9, q9, q14 + vmovn.i32 d22, q11 + vmovn.i32 d23, q12 + vmovn.i32 d16, q8 + vmovn.i32 d17, q9 +.else + vrshrn.i32 d22, q11, #4 + vrshrn.i32 d23, q12, #4 + vrshrn.i32 d16, q8, #4 + vrshrn.i32 d17, q9, #4 + vsub.i16 q11, q11, q14 + vsub.i16 q8, q8, q14 +.endif + subs \h, \h, #2 + vst1.16 {q11}, [\dst, :128], \d_strd + vst1.16 {q8}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + pop {r4-r11,pc} +endfunc +.endm + +filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 +filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 + +.macro load_filter_ptr src + asr r12, \src, #10 + add r12, r11, r12, lsl #3 +.endm + +.macro load_filter_coef dst, src, inc + vld1.8 {\dst}, [r12, :64] + add \src, \src, \inc +.endm + +.macro load_filter_row dst, src, inc + load_filter_ptr \src + load_filter_coef \dst, \src, \inc +.endm + +function 
warp_filter_horz_neon + load_filter_ptr r5 // filter 0 + vld1.16 {q6,q7}, [r2], r3 + + load_filter_coef d0, r5, r7 // filter 0 + load_filter_row d2, r5, r7 // filter 1 + vmovl.s8 q0, d0 // filter 0 + vext.8 q3, q6, q7, #2*1 // filter 1 pixels + vmovl.s8 q1, d2 // filter 1 + + vmull.s16 q4, d12, d0 // filter 0 output (0-3) + vmull.s16 q5, d13, d1 // filter 0 output (4-7) + + load_filter_ptr r5 // filter 2 + + vmull.s16 q2, d6, d2 // filter 1 output (0-3) + vmull.s16 q3, d7, d3 // filter 1 output (4-7) + + load_filter_coef d0, r5, r7 // filter 2 + + vpadd.i32 d8, d8, d9 // half pixel 0 (2x32) + vpadd.i32 d9, d10, d11 // half pixel 0 (2x32) + + load_filter_ptr r5 // filter 3 + + vpadd.i32 d4, d4, d5 // half pixel 1 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 1 (2x32) + + vmovl.s8 q0, d0 // filter 2 + vext.8 q3, q6, q7, #2*2 // filter 2 pixels + + vpadd.i32 d8, d8, d9 // pixel 0 (2x32) + vpadd.i32 d9, d4, d5 // pixel 1 (2x32) + + load_filter_coef d2, r5, r7 // filter 3 + + vmull.s16 q2, d6, d0 // filter 2 output (0-3) + vmull.s16 q3, d7, d1 // filter 2 output (4-7) + + load_filter_ptr r5 // filter 4 + + vpadd.i32 d8, d8, d9 // pixel 0,1 + + vpadd.i32 d9, d4, d5 // half pixel 2 (2x32) + vpadd.i32 d10, d6, d7 // half pixel 2 (2x32) + + vmovl.s8 q1, d2 // filter 3 + vext.8 q3, q6, q7, #2*3 // filter 3 pixels + + load_filter_coef d0, r5, r7 // filter 4 + + vpadd.i32 d9, d9, d10 // pixel 2 (2x32) + + vmull.s16 q2, d6, d2 // filter 3 output (0-3) + vmull.s16 q3, d7, d3 // filter 3 output (4-7) + + vmovl.s8 q0, d0 // filter 4 + load_filter_ptr r5 // filter 5 + + vpadd.i32 d10, d4, d5 // half pixel 3 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 3 (2x32) + + vext.8 q3, q6, q7, #2*4 // filter 4 pixels + load_filter_coef d2, r5, r7 // filter 5 + + vpadd.i32 d10, d10, d11 // pixel 3 (2x32) + + vpadd.i32 d9, d9, d10 // pixel 2,3 + + vmull.s16 q2, d6, d0 // filter 4 output (0-3) + vmull.s16 q3, d7, d1 // filter 4 output (4-7) + + vmovl.s8 q1, d2 // filter 5 + load_filter_ptr r5 // 
filter 6 + + vpadd.i32 d10, d4, d5 // half pixel 4 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 4 (2x32) + + vext.8 q3, q6, q7, #2*5 // filter 5 pixels + load_filter_coef d0, r5, r7 // filter 6 + + vpadd.i32 d10, d10, d11 // pixel 4 (2x32) + + vmull.s16 q2, d6, d2 // filter 5 output (0-3) + vmull.s16 q3, d7, d3 // filter 5 output (4-7) + + vmovl.s8 q0, d0 // filter 6 + load_filter_ptr r5 // filter 7 + + vpadd.i32 d4, d4, d5 // half pixel 5 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 5 (2x32) + + vext.8 q3, q6, q7, #2*6 // filter 6 pixels + load_filter_coef d2, r5, r7 // filter 7 + + vpadd.i32 d11, d4, d5 // pixel 5 (2x32) + + vmull.s16 q2, d6, d0 // filter 6 output (0-3) + vmull.s16 q3, d7, d1 // filter 6 output (4-7) + + vmovl.s8 q1, d2 // filter 7 + + vpadd.i32 d10, d10, d11 // pixel 4,5 + + vpadd.i32 d4, d4, d5 // half pixel 6 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 6 (2x32) + + vext.8 q3, q6, q7, #2*7 // filter 7 pixels + + vpadd.i32 d11, d4, d5 // pixel 6 (2x32) + + vmull.s16 q2, d6, d2 // filter 7 output (0-3) + vmull.s16 q3, d7, d3 // filter 7 output (4-7) + + vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits) + + vpadd.i32 d4, d4, d5 // half pixel 7 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 7 (2x32) + + sub r5, r5, r7, lsl #3 + + vpadd.i32 d4, d4, d5 // pixel 7 (2x32) + + add r5, r5, r8 + + vpadd.i32 d11, d11, d4 // pixel 6,7 + + vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits) + vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits) + + bx lr +endfunc + +// void dav1d_warp_affine_8x8_16bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my, +// const int bitdepth_max) +.macro warp t +function warp_affine_8x8\t\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + sub sp, sp, #8 + + clz r7, r7 + // intermediate_bits = clz(bitdepth_max) - 18 +.ifb \t + sub r8, r7, #11 // 7 + intermediate_bits = 
clz(bitdepth_max) - 18 + 7 +.endif + sub r7, r7, #25 // -(7 - intermediate_bits) +.ifb \t + neg r8, r8 // -(7 + intermediate_bits) +.endif + str r7, [sp] // spill -(7 - intermediate_bits) on stack +.ifb \t + str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack +.endif + + ldrd r8, r9, [r4] + sxth r7, r8 + asr r8, r8, #16 + asr r4, r9, #16 + sxth r9, r9 + mov r10, #8 + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + sub r2, r2, #6 + movrel r11, X(mc_warp_filter), 64*8 +.ifnb \t + lsl r1, r1, #1 +.endif + add r5, r5, #512 + add r6, r6, #512 + + bl warp_filter_horz_neon + vmovn.i32 d16, q4 + vmovn.i32 d17, q5 + bl warp_filter_horz_neon + vmovn.i32 d18, q4 + vmovn.i32 d19, q5 + bl warp_filter_horz_neon + vmovn.i32 d20, q4 + vmovn.i32 d21, q5 + bl warp_filter_horz_neon + vmovn.i32 d22, q4 + vmovn.i32 d23, q5 + bl warp_filter_horz_neon + vmovn.i32 d24, q4 + vmovn.i32 d25, q5 + bl warp_filter_horz_neon + vmovn.i32 d26, q4 + vmovn.i32 d27, q5 + bl warp_filter_horz_neon + vmovn.i32 d28, q4 + vmovn.i32 d29, q5 + +1: + bl warp_filter_horz_neon + vmovn.i32 d30, q4 + vmovn.i32 d31, q5 + + load_filter_row d8, r6, r9 + load_filter_row d9, r6, r9 + load_filter_row d10, r6, r9 + load_filter_row d11, r6, r9 + load_filter_row d12, r6, r9 + load_filter_row d13, r6, r9 + load_filter_row d14, r6, r9 + load_filter_row d15, r6, r9 + transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 + vmovl.s8 q1, d8 + vmovl.s8 q2, d9 + vmovl.s8 q3, d10 + vmovl.s8 q4, d11 + vmovl.s8 q5, d12 + vmovl.s8 q6, d13 + + sub r6, r6, r9, lsl #3 + + // This ordering of vmull/vmlal is highly beneficial for + // Cortex A8/A9/A53 here, but harmful for Cortex A7. 
+ vmull.s16 q0, d16, d2 + vmlal.s16 q0, d18, d4 + vmlal.s16 q0, d20, d6 + vmlal.s16 q0, d22, d8 + vmlal.s16 q0, d24, d10 + vmlal.s16 q0, d26, d12 + vmull.s16 q1, d17, d3 + vmlal.s16 q1, d19, d5 + vmlal.s16 q1, d21, d7 + vmlal.s16 q1, d23, d9 + vmlal.s16 q1, d25, d11 + vmlal.s16 q1, d27, d13 + + vmovl.s8 q2, d14 + vmovl.s8 q3, d15 + + vmlal.s16 q0, d28, d4 + vmlal.s16 q0, d30, d6 + vmlal.s16 q1, d29, d5 + vmlal.s16 q1, d31, d7 + +.ifb \t + ldr lr, [sp, #4] // -(7 + intermediate_bits) + ldr r12, [sp, #120] // bitdepth_max + vdup.32 q2, lr // -(7 + intermediate_bits) + vdup.16 q3, r12 // bitdepth_max +.endif + + vmov q8, q9 + vmov q9, q10 +.ifb \t + vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits) + vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits) +.else + vrshrn.s32 d0, q0, #7 + vrshrn.s32 d1, q1, #7 + vmov.i16 q3, #PREP_BIAS +.endif + vmov q10, q11 +.ifb \t + vqmovun.s32 d0, q0 + vqmovun.s32 d1, q1 +.else + vsub.i16 q0, q0, q3 // PREP_BIAS +.endif + vmov q11, q12 + vmov q12, q13 +.ifb \t + vmin.u16 q0, q0, q3 // bitdepth_max +.endif + vmov q13, q14 + vmov q14, q15 + subs r10, r10, #1 + vst1.16 {q0}, [r0, :128], r1 + + add r6, r6, r4 + bgt 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +warp +warp t diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S new file mode 100644 index 0000000000..b06e109dda --- /dev/null +++ b/third_party/dav1d/src/arm/32/msac.S @@ -0,0 +1,575 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 4 +#define DIF 8 +#define RNG 12 +#define CNT 16 +#define ALLOW_UPDATE_CDF 20 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits, align=4 + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro vld1_align_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vld1.16 {\q0}, [\src, :128] +.else + vld1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vld1_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src] +.elseif \n == 8 + vld1.16 {\q0}, [\src] +.else + vld1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vst1_align_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vst1.16 {\q0}, [\src, :128] +.else + vst1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vst1_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src] +.elseif \n == 8 + vst1.16 {\q0}, [\src] +.else + vst1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshr.u16 \d0, \s0, \s3 +.else + vshr.u16 \d1, \s1, \s4 +.if \n == 16 + vshr.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vadd.i16 \d0, \s0, \s3 +.else + vadd.i16 \d1, \s1, \s4 +.if \n == 16 + vadd.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vsub.i16 \d0, \s0, \s3 +.else + vsub.i16 \d1, \s1, \s4 +.if \n == 16 + vsub.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vand \d0, \s0, \s3 +.else + vand \d1, \s1, \s4 +.if \n == 16 + vand \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vcge.u16 \d0, 
\s0, \s3 +.else + vcge.u16 \d1, \s1, \s4 +.if \n == 16 + vcge.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vrhadd.u16 \d0, \s0, \s3 +.else + vrhadd.u16 \d1, \s1, \s4 +.if \n == 16 + vrhadd.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshl.s16 \d0, \s0, \s3 +.else + vshl.s16 \d1, \s1, \s4 +.if \n == 16 + vshl.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vqdmulh.s16 \d0, \s0, \s3 +.else + vqdmulh.s16 \d1, \s1, \s4 +.if \n == 16 + vqdmulh.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, +// size_t n_symbols); + +function msac_decode_symbol_adapt4_neon, export=1 +.macro decode_update n + push {r4-r10,lr} + sub sp, sp, #48 + add r8, r0, #RNG + + vld1_align_n d0, q0, q1, r1, \n // cdf + vld1.16 {d16[]}, [r8, :16] // rng + movrel_local r9, coeffs, 30 + vmov.i16 d30, #0x7f00 // 0x7f00 + sub r9, r9, r2, lsl #1 + vmvn.i16 q14, #0x3f // 0xffc0 + add r8, sp, #14 + vand d22, d16, d30 // rng & 0x7f00 + vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng + vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 +.if \n > 4 + vmov d23, d22 +.endif + + vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) + vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r8, r0, #DIF + 2 + + vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) +.if \n == 4 + vmov.i16 d17, #0 +.endif + vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + + add r9, sp, #16 + vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) + movrel_local r8, bits + vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access + + vmov d21, d20 + 
vld1_align_n q12, q12, q13, r8, \n +.if \n == 16 + vmov q11, q10 +.endif + + vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v + + vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask +.if \n == 16 + vadd.i16 q10, q10, q11 +.endif + vadd.i16 d20, d20, d21 // Aggregate mask bits + ldr r4, [r0, #ALLOW_UPDATE_CDF] + vpadd.i16 d20, d20, d20 + lsl r10, r2, #1 + vpadd.i16 d20, d20, d20 + vmov.u16 r3, d20[0] + cmp r4, #0 + rbit r3, r3 + clz lr, r3 // ret + + beq L(renorm) + // update_cdf + ldrh r3, [r1, r10] // count = cdf[n_symbols] + vmov.i8 q10, #0xff +.if \n == 16 + mov r4, #-5 +.else + mvn r12, r2 + mov r4, #-4 + cmn r12, #3 // set C if n_symbols <= 2 +.endif + vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 +.if \n == 16 + sub r4, r4, r3, lsr #4 // -((count >> 4) + 5) +.else + lsr r12, r3, #4 // count >> 4 + sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) +.endif + vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) +.if \n == 4 + vdup.16 d20, r4 // -rate +.else + vdup.16 q10, r4 // -rate +.endif + + sub r3, r3, r3, lsr #5 // count - (count == 32) + vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 
1 : 0) + vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate + add r3, r3, #1 // count + (count < 32) + vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate + vst1_align_n d0, q0, q1, r1, \n + strh r3, [r1, r10] +.endm + + decode_update 4 + +L(renorm): + add r8, sp, #16 + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) +L(renorm2): + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + str r6, [r0, #CNT] + str r7, [r0, #DIF] + + mov r0, lr + add sp, sp, #48 + + pop {r4-r10,pc} +endfunc + +function msac_decode_symbol_adapt8_neon, export=1 + decode_update 8 + b L(renorm) +endfunc + +function msac_decode_symbol_adapt16_neon, export=1 + decode_update 16 + b L(renorm) +endfunc + +function msac_decode_hi_tok_neon, export=1 + push {r4-r10,lr} + 
vld1.16 {d0}, [r1, :64] // cdf + add r4, r0, #RNG + vmov.i16 d31, #0x7f00 // 0x7f00 + movrel_local r5, coeffs, 30-2*3 + vmvn.i16 d30, #0x3f // 0xffc0 + ldrh r9, [r1, #6] // count = cdf[n_symbols] + vld1.16 {d1[]}, [r4, :16] // rng + movrel_local r4, bits + vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) + add r5, r0, #DIF + 2 + vld1.16 {q8}, [r4, :128] + mov r2, #-24 + vand d20, d0, d30 // cdf & 0xffc0 + ldr r10, [r0, #ALLOW_UPDATE_CDF] + vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + vmov d3, d2 +1: + vand d23, d1, d31 // rng & 0x7f00 + vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r12, sp, #14 + vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) + vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + vmov.i16 d7, #0 + vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng + add r12, sp, #16 + vcge.u16 q2, q1, q3 // c >= v + vst1.16 {q3}, [r12] // store v values to allow indexed access + vand q9, q2, q8 // One bit per halfword set in the mask + + vadd.i16 d18, d18, d19 // Aggregate mask bits + vpadd.i16 d18, d18, d18 + vpadd.i16 d18, d18, d18 + vmov.u16 r3, d18[0] + cmp r10, #0 + add r2, r2, #5 + rbit r3, r3 + add r8, sp, #16 + clz lr, r3 // ret + + beq 2f + // update_cdf + vmov.i8 d22, #0xff + mov r4, #-5 + vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768 + sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) + vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) + vdup.16 d18, r4 // -rate + + sub r9, r9, r9, lsr #5 // count - (count == 32) + vsub.i16 d0, d0, d4 // cdf + (i >= val ? 
1 : 0) + vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate + add r9, r9, #1 // count + (count < 32) + vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate + vst1.16 {d0}, [r1, :64] + vand d20, d0, d30 // cdf & 0xffc0 + strh r9, [r1, #6] + +2: + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + vdup.16 d1, r4 + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + lsl lr, lr, #1 + sub lr, lr, #5 + lsr r12, r7, #16 + adds r2, r2, lr // carry = tok_br < 3 || tok == 15 + vdup.16 q1, r12 + bcc 1b // loop if !carry + add r2, r2, #30 + str r6, [r0, #CNT] + add sp, sp, #48 + str r7, [r0, #DIF] + lsr r0, r2, #1 + pop {r4-r10,pc} +endfunc + +function msac_decode_bool_equi_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + bic r4, r5, #0xff // r &= 0xff00 + add r4, r4, #8 + mov
r2, #0 + subs r8, r7, r4, lsl #15 // dif - vw + lsr r4, r4, #1 // v + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + bic r1, r1, #0x3f // f &= ~63 + mul r4, r4, r1 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + push {r4-r10,lr} + ldr r9, [r1] // cdf[0-1] + ldr r5, [r0, #RNG] + movw lr, #0xffc0 + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + and r2, r9, lr // f &= ~63 + mul r4, r4, r2 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + ldr r10, [r0, #ALLOW_UPDATE_CDF] + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + cmp r10, #0 + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + + beq L(renorm2) + + lsr r2, r9, #16 // count = cdf[1] + uxth r9, r9 // cdf[0] + + sub r3, r2, r2, lsr #5 // count - (count >= 32) + lsr r2, r2, #4 // count >> 4 + add r10, r3, #1 // count + (count < 32) + add r2, r2, #4 // rate = (count >> 4) | 4 + + sub r9, r9, lr // cdf[0] -= bit + sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} + asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate + sub r9, r9, r3 // cdf[0] + + strh r9, [r1] + strh r10, [r1, #2] + + b 
L(renorm2) +endfunc diff --git a/third_party/dav1d/src/arm/32/util.S b/third_party/dav1d/src/arm/32/util.S new file mode 100644 index 0000000000..6af0158e09 --- /dev/null +++ b/third_party/dav1d/src/arm/32/util.S @@ -0,0 +1,126 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#ifndef DAV1D_SRC_ARM_32_UTIL_S +#define DAV1D_SRC_ARM_32_UTIL_S + +#include "config.h" +#include "src/arm/asm.S" + +.macro movrel_local rd, val, offset=0 +#if defined(PIC) + ldr \rd, 90001f + b 90002f +90001: + .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB) +90002: + add \rd, \rd, pc +#else + movw \rd, #:lower16:\val+\offset + movt \rd, #:upper16:\val+\offset +#endif +.endm + +.macro movrel rd, val, offset=0 +#if defined(PIC) && defined(__APPLE__) + ldr \rd, 1f + b 2f +1: + .word 3f - (2f + 8 - 4 * CONFIG_THUMB) +2: + ldr \rd, [pc, \rd] +.if \offset < 0 + sub \rd, \rd, #-(\offset) +.elseif \offset > 0 + add \rd, \rd, #\offset +.endif + .non_lazy_symbol_pointer +3: + .indirect_symbol \val + .word 0 + .text +#else + movrel_local \rd, \val, \offset +#endif +.endm + +.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \q0, \q2 + vtrn.32 \q1, \q3 + + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 + vswp \d0, \d4 + vswp \d1, \d5 + vswp \d2, \d6 + vswp \d3, \d7 + + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm + +.macro transpose_4x8b q0, q1, r0, r1, r2, r3 + vtrn.16 \q0, \q1 + + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro transpose_4x4h q0, q1, r0, r1, r2, r3 + vtrn.32 \q0, \q1 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +.macro transpose_4x8h r0, r1, r2, r3 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +#endif /* DAV1D_SRC_ARM_32_UTIL_S */ diff --git a/third_party/dav1d/src/arm/64/cdef.S b/third_party/dav1d/src/arm/64/cdef.S new file mode 100644 index 0000000000..6104470a63 --- /dev/null +++ 
b/third_party/dav1d/src/arm/64/cdef.S @@ -0,0 +1,517 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret + tst w6, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + sub \s1, \s1, #2 + sub \s2, \s2, #2 + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr s1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr s3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str \rw\()0, [x0] + str d1, [x0, #2*\w] + add x0, x0, #2*\stride + str \rw\()2, [x0] + str d3, [x0, #2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr h1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr h3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str \rw\()0, [x0] + str s1, [x0, #2*\w] + str s31, [x0, #2*\w+4] + add x0, x0, #2*\stride + str \rw\()2, [x0] + str s3, [x0, #2*\w] + str s31, [x0, #2*\w+4] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr h1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr h3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \rw\()2, [x0, #4] + str s3, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr \rn\()1, [\s2] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \rw\()1, [x0, #4] + str s31, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr dst, src, incr, w +.if \w == 4 + ld1 {\dst\().s}[0], [\src], 
\incr +.else + ld1 {\dst\().8b}, [\src], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func w, stride, rn, rw +function cdef_padding\w\()_8bpc_neon, export=1 + cmp w6, #0xf // fully edged + b.eq cdef_padding\w\()_edged_8bpc_neon + movi v30.8h, #0x80, lsl #8 + mov v31.16b, v30.16b + sub x0, x0, #2*(2*\stride+2) + tst w6, #4 // CDEF_HAVE_TOP + b.ne 1f + // !CDEF_HAVE_TOP + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + b 3f +1: + // CDEF_HAVE_TOP + add x9, x4, x2 + pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 + + // Middle section +3: + tst w6, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ld1 {v0.h}[0], [x3], #2 + ldr h2, [x1, #\w] + load_n_incr v1, x1, x2, \w + subs w5, w5, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + str s0, [x0] + stur \rw\()1, [x0, #4] + str s2, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ld1 {v0.h}[0], [x3], #2 + load_n_incr v1, x1, x2, \w + subs w5, w5, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s0, [x0] + stur \rw\()1, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + b 3f +2: + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldr h1, [x1, #\w] + load_n_incr v0, x1, x2, \w + subs w5, w5, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr v0, x1, x2, \w + subs w5, w5, #1 + uxtl v0.8h, v0.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + +3: + tst w6, #8 // CDEF_HAVE_BOTTOM + b.ne 1f + // 
!CDEF_HAVE_BOTTOM + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + ret +1: + // CDEF_HAVE_BOTTOM + add x9, x1, x2 + pad_top_bottom x1, x9, \w, \stride, \rn, \rw, 1 +endfunc +.endm + +padding_func 8, 16, d, q +padding_func 4, 8, s, d + +// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_edged w, stride, reg +function cdef_padding\w\()_edged_8bpc_neon, export=1 + sub x4, x4, #2 + sub x0, x0, #(2*\stride+2) + +.if \w == 4 + ldr d0, [x4] + ldr d1, [x4, x2] + st1 {v0.8b, v1.8b}, [x0], #16 +.else + add x9, x4, x2 + ldr d0, [x4] + ldr s1, [x4, #8] + ldr d2, [x9] + ldr s3, [x9, #8] + str d0, [x0] + str s1, [x0, #8] + str d2, [x0, #\stride] + str s3, [x0, #\stride+8] + add x0, x0, #2*\stride +.endif + +0: + ld1 {v0.h}[0], [x3], #2 + ldr h2, [x1, #\w] + load_n_incr v1, x1, x2, \w + subs w5, w5, #1 + str h0, [x0] + stur \reg\()1, [x0, #2] + str h2, [x0, #2+\w] + add x0, x0, #\stride + b.gt 0b + + sub x1, x1, #2 +.if \w == 4 + ldr d0, [x1] + ldr d1, [x1, x2] + st1 {v0.8b, v1.8b}, [x0], #16 +.else + add x9, x1, x2 + ldr d0, [x1] + ldr s1, [x1, #8] + ldr d2, [x9] + ldr s3, [x9, #8] + str d0, [x0] + str s1, [x0, #8] + str d2, [x0, #\stride] + str s3, [x0, #\stride+8] +.endif + ret +endfunc +.endm + +padding_func_edged 8, 16, d +padding_func_edged 4, 8, s + +tables + +filter 8, 8 +filter 4, 8 + +find_dir 8 + +.macro load_px_8 d1, d2, w +.if \w == 8 + add x6, x2, w9, sxtb // x + off + sub x9, x2, w9, sxtb // x - off + ld1 {\d1\().d}[0], [x6] // p0 + add x6, x6, #16 // += stride + ld1 {\d2\().d}[0], [x9] // p1 + add x9, x9, #16 // += stride + ld1 {\d1\().d}[1], [x6] // p0 + ld1 {\d2\().d}[1], [x9] // p1 +.else + add x6, x2, w9, sxtb // x + off + sub x9, x2, w9, sxtb // x - off + ld1 {\d1\().s}[0], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[0], [x9] // p1 + add x9, x9, #8
// += stride + ld1 {\d1\().s}[1], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[1], [x9] // p1 + add x9, x9, #8 // += stride + ld1 {\d1\().s}[2], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[2], [x9] // p1 + add x9, x9, #8 // += stride + ld1 {\d1\().s}[3], [x6] // p0 + ld1 {\d2\().s}[3], [x9] // p1 +.endif +.endm +.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min +.if \min + umin v3.16b, v3.16b, \s1\().16b + umax v4.16b, v4.16b, \s1\().16b + umin v3.16b, v3.16b, \s2\().16b + umax v4.16b, v4.16b, \s2\().16b +.endif + uabd v16.16b, v0.16b, \s1\().16b // abs(diff) + uabd v20.16b, v0.16b, \s2\().16b // abs(diff) + ushl v17.16b, v16.16b, \shift // abs(diff) >> shift + ushl v21.16b, v20.16b, \shift // abs(diff) >> shift + uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) + uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) + cmhi v18.16b, v0.16b, \s1\().16b // px > p0 + cmhi v22.16b, v0.16b, \s2\().16b // px > p1 + umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip) + umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) + dup v19.16b, \tap // taps[k] + neg v16.16b, v17.16b // -imin() + neg v20.16b, v21.16b // -imin() + bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() + bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() + smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain() + smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain() + smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain() + smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain() +.endm + +// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint8_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h); +.macro filter_func_8 w, pri, sec, min, suffix +function cdef_filter\w\suffix\()_edged_8bpc_neon +.if \pri + movrel x8, pri_taps + and w9, w3, #1 + add x8, x8, w9, uxtw #1 +.endif + movrel x9, 
directions\w + add x5, x9, w5, uxtw #1 + movi v30.8b, #7 + dup v28.8b, w6 // damping + +.if \pri + dup v25.16b, w3 // threshold +.endif +.if \sec + dup v27.16b, w4 // threshold +.endif + trn1 v24.8b, v25.8b, v27.8b + clz v24.8b, v24.8b // clz(threshold) + sub v24.8b, v30.8b, v24.8b // ulog2(threshold) + uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) + neg v24.8b, v24.8b // -shift +.if \sec + dup v26.16b, v24.b[1] +.endif +.if \pri + dup v24.16b, v24.b[0] +.endif + +1: +.if \w == 8 + add x12, x2, #16 + ld1 {v0.d}[0], [x2] // px + ld1 {v0.d}[1], [x12] // px +.else + add x12, x2, #1*8 + add x13, x2, #2*8 + add x14, x2, #3*8 + ld1 {v0.s}[0], [x2] // px + ld1 {v0.s}[1], [x12] // px + ld1 {v0.s}[2], [x13] // px + ld1 {v0.s}[3], [x14] // px +.endif + + movi v1.8h, #0 // sum + movi v2.8h, #0 // sum +.if \min + mov v3.16b, v0.16b // min + mov v4.16b, v0.16b // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. 
+ mov w11, #2 // sec_taps[0] + +2: +.if \pri + ldrb w9, [x5] // off1 + + load_px_8 v5, v6, \w +.endif + +.if \sec + add x5, x5, #4 // +2*2 + ldrb w9, [x5] // off2 + load_px_8 v28, v29, \w +.endif + +.if \pri + ldrb w10, [x8] // *pri_taps + + handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min +.endif + +.if \sec + add x5, x5, #8 // +2*4 + ldrb w9, [x5] // off3 + load_px_8 v5, v6, \w + + handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min + + handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min + + sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; +.else + add x5, x5, #1 // x5 += 1 +.endif + subs w11, w11, #1 // sec_tap-- (value) +.if \pri + add x8, x8, #1 // pri_taps++ (pointer) +.endif + b.ne 2b + + sshr v5.8h, v1.8h, #15 // -(sum < 0) + sshr v6.8h, v2.8h, #15 // -(sum < 0) + add v1.8h, v1.8h, v5.8h // sum - (sum < 0) + add v2.8h, v2.8h, v6.8h // sum - (sum < 0) + srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 + srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4 + uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4 + uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) 
>> 4 + sqxtun v0.8b, v1.8h + sqxtun2 v0.16b, v2.8h +.if \min + umin v0.16b, v0.16b, v4.16b + umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) +.endif +.if \w == 8 + st1 {v0.d}[0], [x0], x1 + add x2, x2, #2*16 // tmp += 2*tmp_stride + subs w7, w7, #2 // h -= 2 + st1 {v0.d}[1], [x0], x1 +.else + st1 {v0.s}[0], [x0], x1 + add x2, x2, #4*8 // tmp += 4*tmp_stride + st1 {v0.s}[1], [x0], x1 + subs w7, w7, #4 // h -= 4 + st1 {v0.s}[2], [x0], x1 + st1 {v0.s}[3], [x0], x1 +.endif + + // Reset pri_taps and directions back to the original point + sub x5, x5, #2 +.if \pri + sub x8, x8, #2 +.endif + + b.gt 1b + ret +endfunc +.endm + +.macro filter_8 w +filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri +filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec +filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec +.endm + +filter_8 8 +filter_8 4 diff --git a/third_party/dav1d/src/arm/64/cdef16.S b/third_party/dav1d/src/arm/64/cdef16.S new file mode 100644 index 0000000000..125ecb2663 --- /dev/null +++ b/third_party/dav1d/src/arm/64/cdef16.S @@ -0,0 +1,228 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +.macro pad_top_bot_16 s1, s2, w, stride, reg, ret + tst w6, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + sub \s1, \s1, #4 + sub \s2, \s2, #4 + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr d1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr d3, [\s2, #2*\w] + str \reg\()0, [x0] + str d1, [x0, #2*\w] + add x0, x0, #2*\stride + str \reg\()2, [x0] + str d3, [x0, #2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr s1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr s3, [\s2, #2*\w] + str \reg\()0, [x0] + str s1, [x0, #2*\w] + str s31, [x0, #2*\w+4] + add x0, x0, #2*\stride + str \reg\()2, [x0] + str s3, [x0, #2*\w] + str s31, [x0, #2*\w+4] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr s1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr s3, [\s2, #2*\w] + str s31, [x0] + stur \reg\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \reg\()2, [x0, #4] + str s3, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr \reg\()1, [\s2] + str s31, [x0] + stur 
\reg\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \reg\()1, [x0, #4] + str s31, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr_16 dst, src, incr, w +.if \w == 4 + ld1 {\dst\().4h}, [\src], \incr +.else + ld1 {\dst\().8h}, [\src], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_16 w, stride, reg +function cdef_padding\w\()_16bpc_neon, export=1 + movi v30.8h, #0x80, lsl #8 + mov v31.16b, v30.16b + sub x0, x0, #2*(2*\stride+2) + tst w6, #4 // CDEF_HAVE_TOP + b.ne 1f + // !CDEF_HAVE_TOP + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + b 3f +1: + // CDEF_HAVE_TOP + add x9, x4, x2 + pad_top_bot_16 x4, x9, \w, \stride, \reg, 0 + + // Middle section +3: + tst w6, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ld1 {v0.s}[0], [x3], #4 + ldr s2, [x1, #2*\w] + load_n_incr_16 v1, x1, x2, \w + subs w5, w5, #1 + str s0, [x0] + stur \reg\()1, [x0, #4] + str s2, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ld1 {v0.s}[0], [x3], #4 + load_n_incr_16 v1, x1, x2, \w + subs w5, w5, #1 + str s0, [x0] + stur \reg\()1, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + b 3f +2: + tst w6, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldr s1, [x1, #2*\w] + load_n_incr_16 v0, x1, x2, \w + subs w5, w5, #1 + str s31, [x0] + stur \reg\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr_16 v0, x1, x2, \w + subs w5, w5, #1 + str s31, [x0] + stur \reg\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, 
#2*\stride + b.gt 1b + +3: + tst w6, #8 // CDEF_HAVE_BOTTOM + b.ne 1f + // !CDEF_HAVE_BOTTOM + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + ret +1: + // CDEF_HAVE_BOTTOM + add x9, x1, x2 + pad_top_bot_16 x1, x9, \w, \stride, \reg, 1 +endfunc +.endm + +padding_func_16 8, 16, q +padding_func_16 4, 8, d + +tables + +filter 8, 16 +filter 4, 16 + +find_dir 16 diff --git a/third_party/dav1d/src/arm/64/cdef_tmpl.S b/third_party/dav1d/src/arm/64/cdef_tmpl.S new file mode 100644 index 0000000000..87c6388b4c --- /dev/null +++ b/third_party/dav1d/src/arm/64/cdef_tmpl.S @@ -0,0 +1,482 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro dir_table w, stride +const directions\w + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 + .byte 1 * \stride + 0, 2 * \stride + 0 + .byte 1 * \stride + 0, 2 * \stride - 1 +// Repeated, to avoid & 7 + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 +endconst +.endm + +.macro tables +dir_table 8, 16 +dir_table 4, 8 + +const pri_taps + .byte 4, 2, 3, 3 +endconst +.endm + +.macro load_px d1, d2, w +.if \w == 8 + add x6, x2, w9, sxtb #1 // x + off + sub x9, x2, w9, sxtb #1 // x - off + ld1 {\d1\().8h}, [x6] // p0 + ld1 {\d2\().8h}, [x9] // p1 +.else + add x6, x2, w9, sxtb #1 // x + off + sub x9, x2, w9, sxtb #1 // x - off + ld1 {\d1\().4h}, [x6] // p0 + add x6, x6, #2*8 // += stride + ld1 {\d2\().4h}, [x9] // p1 + add x9, x9, #2*8 // += stride + ld1 {\d1\().d}[1], [x6] // p0 + ld1 {\d2\().d}[1], [x9] // p1 +.endif +.endm +.macro handle_pixel s1, s2, thresh_vec, shift, tap, min +.if \min + umin v2.8h, v2.8h, \s1\().8h + smax v3.8h, v3.8h, \s1\().8h + umin v2.8h, v2.8h, \s2\().8h + smax 
v3.8h, v3.8h, \s2\().8h +.endif + uabd v16.8h, v0.8h, \s1\().8h // abs(diff) + uabd v20.8h, v0.8h, \s2\().8h // abs(diff) + ushl v17.8h, v16.8h, \shift // abs(diff) >> shift + ushl v21.8h, v20.8h, \shift // abs(diff) >> shift + uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift)) + uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift)) + sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px + sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px + neg v16.8h, v17.8h // -clip + neg v20.8h, v21.8h // -clip + smin v18.8h, v18.8h, v17.8h // imin(diff, clip) + smin v22.8h, v22.8h, v21.8h // imin(diff, clip) + dup v19.8h, \tap // taps[k] + smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip) + smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip) + mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain() + mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain() +.endm + +// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func w, bpc, pri, sec, min, suffix +function cdef_filter\w\suffix\()_\bpc\()bpc_neon +.if \bpc == 8 + ldr w8, [sp] // edges + cmp w8, #0xf + b.eq cdef_filter\w\suffix\()_edged_8bpc_neon +.endif +.if \pri +.if \bpc == 16 + ldr w9, [sp, #8] // bitdepth_max + clz w9, w9 + sub w9, w9, #24 // -bitdepth_min_8 + neg w9, w9 // bitdepth_min_8 +.endif + movrel x8, pri_taps +.if \bpc == 16 + lsr w9, w3, w9 // pri_strength >> bitdepth_min_8 + and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1 +.else + and w9, w3, #1 +.endif + add x8, x8, w9, uxtw #1 +.endif + movrel x9, directions\w + add x5, x9, w5, uxtw #1 + movi v30.4h, #15 + dup v28.4h, w6 // damping + +.if \pri + dup v25.8h, w3 // threshold +.endif +.if \sec + dup v27.8h, w4 // threshold +.endif + trn1 v24.4h, v25.4h, v27.4h + clz v24.4h, v24.4h // clz(threshold) + 
sub v24.4h, v30.4h, v24.4h // ulog2(threshold) + uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold)) + neg v24.4h, v24.4h // -shift +.if \sec + dup v26.8h, v24.h[1] +.endif +.if \pri + dup v24.8h, v24.h[0] +.endif + +1: +.if \w == 8 + ld1 {v0.8h}, [x2] // px +.else + add x12, x2, #2*8 + ld1 {v0.4h}, [x2] // px + ld1 {v0.d}[1], [x12] // px +.endif + + movi v1.8h, #0 // sum +.if \min + mov v2.16b, v0.16b // min + mov v3.16b, v0.16b // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov w11, #2 // sec_taps[0] + +2: +.if \pri + ldrb w9, [x5] // off1 + + load_px v4, v5, \w +.endif + +.if \sec + add x5, x5, #4 // +2*2 + ldrb w9, [x5] // off2 + load_px v6, v7, \w +.endif + +.if \pri + ldrb w10, [x8] // *pri_taps + + handle_pixel v4, v5, v25.8h, v24.8h, w10, \min +.endif + +.if \sec + add x5, x5, #8 // +2*4 + ldrb w9, [x5] // off3 + load_px v4, v5, \w + + handle_pixel v6, v7, v27.8h, v26.8h, w11, \min + + handle_pixel v4, v5, v27.8h, v26.8h, w11, \min + + sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; +.else + add x5, x5, #1 // x5 += 1 +.endif + subs w11, w11, #1 // sec_tap-- (value) +.if \pri + add x8, x8, #1 // pri_taps++ (pointer) +.endif + b.ne 2b + + sshr v4.8h, v1.8h, #15 // -(sum < 0) + add v1.8h, v1.8h, v4.8h // sum - (sum < 0) + srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 + add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) 
>> 4 +.if \min + smin v0.8h, v0.8h, v3.8h + smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max) +.endif +.if \bpc == 8 + xtn v0.8b, v0.8h +.endif +.if \w == 8 + add x2, x2, #2*16 // tmp += tmp_stride + subs w7, w7, #1 // h-- +.if \bpc == 8 + st1 {v0.8b}, [x0], x1 +.else + st1 {v0.8h}, [x0], x1 +.endif +.else +.if \bpc == 8 + st1 {v0.s}[0], [x0], x1 +.else + st1 {v0.d}[0], [x0], x1 +.endif + add x2, x2, #2*16 // tmp += 2*tmp_stride + subs w7, w7, #2 // h -= 2 +.if \bpc == 8 + st1 {v0.s}[1], [x0], x1 +.else + st1 {v0.d}[1], [x0], x1 +.endif +.endif + + // Reset pri_taps and directions back to the original point + sub x5, x5, #2 +.if \pri + sub x8, x8, #2 +.endif + + b.gt 1b + ret +endfunc +.endm + +.macro filter w, bpc +filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri +filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec +filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec + +function cdef_filter\w\()_\bpc\()bpc_neon, export=1 + cbnz w3, 1f // pri_strength + b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec +1: + cbnz w4, 1f // sec_strength + b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri +1: + b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec +endfunc +.endm + +const div_table + .short 840, 420, 280, 210, 168, 140, 120, 105 +endconst + +const alt_fact + .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 +endconst + +.macro cost_alt d1, d2, s1, s2, s3, s4 + smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n] + smull2 v23.4s, \s1\().8h, \s1\().8h + smull v24.4s, \s2\().4h, \s2\().4h + smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n] + smull2 v26.4s, \s3\().8h, \s3\().8h + smull v27.4s, \s4\().4h, \s4\().4h + mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact + mla v22.4s, v23.4s, v30.4s + mla v22.4s, v24.4s, v31.4s + mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact + mla v25.4s, v26.4s, v30.4s + mla v25.4s, v27.4s, v31.4s + addv \d1, v22.4s // *cost_ptr + addv \d2, v25.4s // *cost_ptr +.endm + +.macro 
find_best s1, s2, s3 +.ifnb \s2 + mov w5, \s2\().s[0] +.endif + cmp w4, w1 // cost[n] > best_cost + csel w0, w3, w0, gt // best_dir = n + csel w1, w4, w1, gt // best_cost = cost[n] +.ifnb \s2 + add w3, w3, #1 // n++ + cmp w5, w1 // cost[n] > best_cost + mov w4, \s3\().s[0] + csel w0, w3, w0, gt // best_dir = n + csel w1, w5, w1, gt // best_cost = cost[n] + add w3, w3, #1 // n++ +.endif +.endm + +// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, +// unsigned *const var) +.macro find_dir bpc +function cdef_find_dir_\bpc\()bpc_neon, export=1 +.if \bpc == 16 + str d8, [sp, #-0x10]! + clz w3, w3 // clz(bitdepth_max) + sub w3, w3, #24 // -bitdepth_min_8 + dup v8.8h, w3 +.endif + sub sp, sp, #32 // cost + mov w3, #8 +.if \bpc == 8 + movi v31.16b, #128 +.else + movi v31.8h, #128 +.endif + movi v30.16b, #0 + movi v1.8h, #0 // v0-v1 sum_diag[0] + movi v3.8h, #0 // v2-v3 sum_diag[1] + movi v5.8h, #0 // v4-v5 sum_hv[0-1] + movi v7.8h, #0 // v6-v7 sum_alt[0] + movi v17.8h, #0 // v16-v17 sum_alt[1] + movi v18.8h, #0 // v18-v19 sum_alt[2] + movi v19.8h, #0 + movi v21.8h, #0 // v20-v21 sum_alt[3] + +.irpc i, 01234567 +.if \bpc == 8 + ld1 {v26.8b}, [x0], x1 + usubl v26.8h, v26.8b, v31.8b +.else + ld1 {v26.8h}, [x0], x1 + ushl v26.8h, v26.8h, v8.8h + sub v26.8h, v26.8h, v31.8h +.endif + + addv h25, v26.8h // [y] + rev64 v27.8h, v26.8h + addp v28.8h, v26.8h, v30.8h // [(x >> 1)] + add v5.8h, v5.8h, v26.8h // sum_hv[1] + ext v27.16b, v27.16b, v27.16b, #8 // [-x] + rev64 v29.4h, v28.4h // [-(x >> 1)] + ins v4.h[\i], v25.h[0] // sum_hv[0] + +.if \i == 0 + mov v0.16b, v26.16b // sum_diag[0] + mov v2.16b, v27.16b // sum_diag[1] + mov v6.16b, v28.16b // sum_alt[0] + mov v16.16b, v29.16b // sum_alt[1] +.else + ext v22.16b, v30.16b, v26.16b, #(16-2*\i) + ext v23.16b, v26.16b, v30.16b, #(16-2*\i) + ext v24.16b, v30.16b, v27.16b, #(16-2*\i) + ext v25.16b, v27.16b, v30.16b, #(16-2*\i) + add v0.8h, v0.8h, v22.8h // sum_diag[0] + add v1.8h, v1.8h, v23.8h // 
sum_diag[0] + add v2.8h, v2.8h, v24.8h // sum_diag[1] + add v3.8h, v3.8h, v25.8h // sum_diag[1] + ext v22.16b, v30.16b, v28.16b, #(16-2*\i) + ext v23.16b, v28.16b, v30.16b, #(16-2*\i) + ext v24.16b, v30.16b, v29.16b, #(16-2*\i) + ext v25.16b, v29.16b, v30.16b, #(16-2*\i) + add v6.8h, v6.8h, v22.8h // sum_alt[0] + add v7.4h, v7.4h, v23.4h // sum_alt[0] + add v16.8h, v16.8h, v24.8h // sum_alt[1] + add v17.4h, v17.4h, v25.4h // sum_alt[1] +.endif +.if \i < 6 + ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) + ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) + add v18.8h, v18.8h, v22.8h // sum_alt[2] + add v19.4h, v19.4h, v23.4h // sum_alt[2] +.else + add v18.8h, v18.8h, v26.8h // sum_alt[2] +.endif +.if \i == 0 + mov v20.16b, v26.16b // sum_alt[3] +.elseif \i == 1 + add v20.8h, v20.8h, v26.8h // sum_alt[3] +.else + ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) + ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) + add v20.8h, v20.8h, v24.8h // sum_alt[3] + add v21.4h, v21.4h, v25.4h // sum_alt[3] +.endif +.endr + + movi v31.4s, #105 + + smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0] + smlal2 v26.4s, v4.8h, v4.8h + smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1] + smlal2 v27.4s, v5.8h, v5.8h + mul v26.4s, v26.4s, v31.4s // cost[2] *= 105 + mul v27.4s, v27.4s, v31.4s // cost[6] *= 105 + addv s4, v26.4s // cost[2] + addv s5, v27.4s // cost[6] + + rev64 v1.8h, v1.8h + rev64 v3.8h, v3.8h + ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n] + ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n] + + str s4, [sp, #2*4] // cost[2] + str s5, [sp, #6*4] // cost[6] + + movrel x4, div_table + ld1 {v31.8h}, [x4] + + smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0] + smull2 v23.4s, v0.8h, v0.8h + smlal v22.4s, v1.4h, v1.4h + smlal2 v23.4s, v1.8h, v1.8h + smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1] + smull2 v25.4s, v2.8h, v2.8h + smlal v24.4s, v3.4h, v3.4h + smlal2 v25.4s, v3.8h, v3.8h + uxtl v30.4s, v31.4h // div_table + uxtl2 v31.4s, v31.8h + mul v22.4s, v22.4s, 
v30.4s // cost[0] + mla v22.4s, v23.4s, v31.4s // cost[0] + mul v24.4s, v24.4s, v30.4s // cost[4] + mla v24.4s, v25.4s, v31.4s // cost[4] + addv s0, v22.4s // cost[0] + addv s2, v24.4s // cost[4] + + movrel x5, alt_fact + ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105 + + str s0, [sp, #0*4] // cost[0] + str s2, [sp, #4*4] // cost[4] + + uxtl v29.4s, v29.4h // div_table[2*m+1] + 105 + uxtl v30.4s, v30.4h + uxtl v31.4s, v31.4h + + cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3] + cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7] + str s6, [sp, #1*4] // cost[1] + str s16, [sp, #3*4] // cost[3] + + mov w0, #0 // best_dir + mov w1, v0.s[0] // best_cost + mov w3, #1 // n + + str s18, [sp, #5*4] // cost[5] + str s20, [sp, #7*4] // cost[7] + + mov w4, v6.s[0] + + find_best v6, v4, v16 + find_best v16, v2, v18 + find_best v18, v5, v20 + find_best v20 + + eor w3, w0, #4 // best_dir ^4 + ldr w4, [sp, w3, uxtw #2] + sub w1, w1, w4 // best_cost - cost[best_dir ^ 4] + lsr w1, w1, #10 + str w1, [x2] // *var + + add sp, sp, #32 +.if \bpc == 16 + ldr d8, [sp], 0x10 +.endif + ret +endfunc +.endm diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S new file mode 100644 index 0000000000..4be84a1a26 --- /dev/null +++ b/third_party/dav1d/src/arm/64/ipred.S @@ -0,0 +1,2764 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// Fills the block with the constant 128; topleft (x2) is not read. +// clz(width) - 25 indexes the halfword table at the end of the function +// (0 for width 64 up to 4 for width 4); each entry is the distance from the +// table back to the entry label for that width. x0 and x6 address alternating +// rows with the stride in x1 doubled, and every loop iteration stores four +// rows, counting height (w4) down by 4. +function ipred_dc_128_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + movi v0.16b, #128 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +8: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +16: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +// Widths 32 and 64 enter through 320/640 to replicate the fill value into the +// extra registers once, then loop at 32/64. +320: + movi v1.16b, #128 +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + movi v1.16b, #128 + movi v2.16b, #128 + movi v3.16b, #128 +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b},
[x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +// Backward offsets from this table to the entry label for each width, ordered +// 64, 32, 16, 8, 4 to match the clz(width) - 25 index. +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 16b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// Vertical prediction: the entry label for the block width (selected through +// the table at the end, indexed by clz(width) - 25) loads `width` bytes +// starting at topleft + 1 once, and the loop that follows stores that same +// row to every output row. x0 and x6 address alternating rows with the stride +// doubled; each iteration writes four rows and decrements height (w4) by 4. +function ipred_v_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #1 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1 {v0.s}[0], [x2] +4: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8b}, [x2] +8: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.16b}, [x2] +16: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.16b, v1.16b}, [x2] +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +// Backward offsets to the load-then-loop entry labels, ordered 64, 32, 16, 8, 4. +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void
ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// Horizontal prediction: x2 starts at topleft - 4 and steps by -4 (x7). Each +// ld4r reads four consecutive bytes and broadcasts one per register into +// v0-v3; they are stored in the order v3, v2, v1, v0, so successive output +// rows consume bytes at descending addresses. Width dispatch uses the table at +// the end, indexed by clz(width) - 25. Every iteration writes four rows. +function ipred_h_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + sub x2, x2, #4 + sub x5, x5, w3, uxtw + mov x7, #-4 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v1.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + st1 {v3.8b}, [x0], x1 + st1 {v2.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +16: + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +// For widths 32 and 64 the bytes beyond the first 16 of each row are written +// with offset str/stp first; the post-incrementing st1 then writes the first +// 16 bytes and advances the row pointer. +32: + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 32b + ret +64: + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc + +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// DC prediction from the top edge only. Dispatches on clz(width) - 25; each +// entry label sums the top-row bytes read from topleft + 1, narrows with a +// rounding shift (rshrn) to obtain their rounded mean, and broadcasts that +// byte so the store loop fills the whole block with it. +function
ipred_dc_top_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #1 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.8b, v0.b[0] +4: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.8b, v0.b[0] +8: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] +16: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.16b, v1.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v2.4h, v0.4h, v1.4h + rshrn v2.8b, v2.8h, #5 + dup v0.16b, v2.b[0] + dup v1.16b, v2.b[0] +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v4.4h, v0.4h, v1.4h + add v5.4h, v2.4h, v3.4h + add v4.4h, v4.4h, v5.4h + rshrn v4.8b, v4.8h, #6 + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] + dup v2.16b, v4.b[0] + dup v3.16b, v4.b[0] +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b 
+endfunc + +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// DC prediction from the left edge only, in two table-dispatched stages. +// x2 is first moved back by `height` bytes from topleft. The h* stage, chosen +// by clz(height) - 25, computes the rounded mean of `height` bytes read from +// there and broadcasts it into v0; it then branches through x3 to the w* +// stage, chosen by clz(width) - 20 (the second half of the same table), which +// stores the value four rows per iteration until height (w4) is used up. +function ipred_dc_left_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + clz w3, w3 + clz w7, w4 + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw + sub x5, x5, w7, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_left_h4): + // ld1r duplicates the four bytes, so the 8-lane sum is twice the + // 4-pixel sum and the rounding shift by 3 yields the mean of four. + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w4): + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w8): + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w16): + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt L(ipred_dc_left_w16) + ret + +L(ipred_dc_left_h32): + ld1 {v0.16b, v1.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #5 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w32): + mov v1.16b, v0.16b +1: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v0.4h,
v1.4h + add v2.4h, v2.4h, v3.4h + add v0.4h, v0.4h, v2.4h + rshrn v0.8b, v0.8h, #6 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w64): + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 1b + ret + +// Entries 0-4 are the h64..h4 averaging stages (indexed by clz(height) - 25); +// entries 5-9 are the w64..w4 store loops (indexed by clz(width) - 20). +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// DC prediction from both edges, in two table-dispatched stages. The h* stage +// (chosen by clz(height) - 25) sums `height` bytes starting at +// topleft - height; its post-incrementing load leaves x2 at topleft. The w* +// stage (chosen by clz(width) - 20) steps x2 to topleft + 1, adds the top-row +// sum and the rounding term (width + height) >> 1 kept in v16, and shifts +// right by ctz(width + height) using ushl with the negated count in v17. +function ipred_dc_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + add w7, w3, w4 // width + height + clz w3, w3 + clz w6, w4 + dup v16.8h, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw + sub x5, x5, w6, uxtw + ushr v16.8h, v16.8h, #1 // (width + height) >> 1 + dup v17.8h, w7 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_h4): + // Only four bytes are loaded; the upper half of the 8-byte vector is + // zeroed so that uaddlv sums exactly the four left pixels. + ld1 {v0.s}[0], [x2], #4 + ins v0.s[1], wzr + uaddlv h0, v0.8b + br x3 +L(ipred_dc_w4): + add x2, x2, #1 + ld1 {v1.s}[0], [x2] + ins v1.s[1], wzr + add v0.4h, v0.4h, v16.4h +
uaddlv h1, v1.8b + cmp w4, #4 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16 + mov w16, #(0x3334/2) + movk w16, #(0x5556/2), lsl #16 + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8b, v0.b[0] +2: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + ld1 {v0.8b}, [x2], #8 + uaddlv h0, v0.8b + br x3 +L(ipred_dc_w8): + add x2, x2, #1 + ld1 {v1.8b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.8b + cmp w4, #8 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8b, v0.b[0] +2: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + ld1 {v0.16b}, [x2], #16 + uaddlv h0, v0.16b + br x3 +L(ipred_dc_w16): + add x2, x2, #1 + ld1 {v1.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + cmp w4, #16 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.16b, v0.b[0] +2: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + ld1 {v0.16b, v1.16b}, [x2], #32 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.4h, v0.4h, v1.4h + br x3 +L(ipred_dc_w32): + add x2, x2, #1 + ld1 {v1.16b, v2.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + uaddlv h2, v2.16b + cmp w4, #32 + add v0.4h, v0.4h, v1.4h + add v0.4h, v0.4h, v2.4h + ushl v4.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16/64 
+ cmp w4, #8 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v4.4h, v4.4h, v16.4h +1: + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] +2: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v0.4h, v1.4h + add v2.4h, v2.4h, v3.4h + add v0.4h, v0.4h, v2.4h + br x3 +L(ipred_dc_w64): + add x2, x2, #1 + ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + uaddlv h4, v4.16b + add v1.4h, v1.4h, v2.4h + add v3.4h, v3.4h, v4.4h + cmp w4, #64 + add v0.4h, v0.4h, v1.4h + add v0.4h, v0.4h, v3.4h + ushl v4.4h, v0.4h, v17.4h + b.eq 1f + // h = 16/32 + mov w16, #(0x5556/2) + movk w16, #(0x3334/2), lsl #16 + lsr w16, w16, w4 + dup v16.4h, w16 + sqdmulh v4.4h, v4.4h, v16.4h +1: + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] + dup v2.16b, v4.b[0] + dup v3.16b, v4.b[0] +2: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int 
a, +// const int max_width, const int max_height); +function ipred_paeth_8bpc_neon, export=1 + clz w9, w3 + adr x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.16b}, [x2] + add x8, x2, #1 + sub x2, x2, #4 + sub x5, x5, w9, uxtw + mov x7, #-4 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v5.4s}, [x8] + usubl v6.8h, v5.8b, v4.8b // top - topleft +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + zip1 v0.2s, v0.2s, v1.2s + zip1 v2.2s, v2.2s, v3.2s + uaddw v16.8h, v6.8h, v0.8b + uaddw v17.8h, v6.8h, v2.8b + sqxtun v16.8b, v16.8h // base + sqxtun2 v16.16b, v17.8h + zip1 v0.2d, v0.2d, v2.2d + uabd v20.16b, v5.16b, v16.16b // tdiff + uabd v22.16b, v4.16b, v16.16b // tldiff + uabd v16.16b, v0.16b, v16.16b // ldiff + umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) + cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff + cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff + bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... 
+ st1 {v20.s}[3], [x0], x1 + st1 {v20.s}[2], [x6], x1 + subs w4, w4, #4 + st1 {v20.s}[1], [x0], x1 + st1 {v20.s}[0], [x6], x1 + b.gt 4b + ret +80: + ld1r {v5.2d}, [x8] + usubl v6.8h, v5.8b, v4.8b // top - topleft +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + uaddw v16.8h, v6.8h, v0.8b + uaddw v17.8h, v6.8h, v1.8b + uaddw v18.8h, v6.8h, v2.8b + uaddw v19.8h, v6.8h, v3.8b + sqxtun v16.8b, v16.8h // base + sqxtun2 v16.16b, v17.8h + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + zip1 v2.2d, v2.2d, v3.2d + zip1 v0.2d, v0.2d, v1.2d + uabd v21.16b, v5.16b, v18.16b // tdiff + uabd v20.16b, v5.16b, v16.16b + uabd v23.16b, v4.16b, v18.16b // tldiff + uabd v22.16b, v4.16b, v16.16b + uabd v17.16b, v2.16b, v18.16b // ldiff + uabd v16.16b, v0.16b, v16.16b + umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) + umin v18.16b, v20.16b, v22.16b + cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff + cmhs v20.16b, v22.16b, v20.16b + cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff + cmhs v16.16b, v18.16b, v16.16b + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
+ bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 8b + ret +160: +320: +640: + ld1 {v5.16b}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw +1: + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 +2: + usubl v6.8h, v5.8b, v4.8b // top - topleft + usubl2 v7.8h, v5.16b, v4.16b + uaddw v24.8h, v6.8h, v0.8b + uaddw v25.8h, v7.8h, v0.8b + uaddw v26.8h, v6.8h, v1.8b + uaddw v27.8h, v7.8h, v1.8b + uaddw v28.8h, v6.8h, v2.8b + uaddw v29.8h, v7.8h, v2.8b + uaddw v30.8h, v6.8h, v3.8b + uaddw v31.8h, v7.8h, v3.8b + sqxtun v17.8b, v26.8h // base + sqxtun2 v17.16b, v27.8h + sqxtun v16.8b, v24.8h + sqxtun2 v16.16b, v25.8h + sqxtun v19.8b, v30.8h + sqxtun2 v19.16b, v31.8h + sqxtun v18.8b, v28.8h + sqxtun2 v18.16b, v29.8h + uabd v23.16b, v5.16b, v19.16b // tdiff + uabd v22.16b, v5.16b, v18.16b + uabd v21.16b, v5.16b, v17.16b + uabd v20.16b, v5.16b, v16.16b + uabd v27.16b, v4.16b, v19.16b // tldiff + uabd v26.16b, v4.16b, v18.16b + uabd v25.16b, v4.16b, v17.16b + uabd v24.16b, v4.16b, v16.16b + uabd v19.16b, v3.16b, v19.16b // ldiff + uabd v18.16b, v2.16b, v18.16b + uabd v17.16b, v1.16b, v17.16b + uabd v16.16b, v0.16b, v16.16b + umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) + umin v30.16b, v22.16b, v26.16b + umin v29.16b, v21.16b, v25.16b + umin v28.16b, v20.16b, v24.16b + cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff + cmhs v22.16b, v26.16b, v22.16b + cmhs v21.16b, v25.16b, v21.16b + cmhs v20.16b, v24.16b, v20.16b + cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff + cmhs v18.16b, v30.16b, v18.16b + cmhs v17.16b, v29.16b, v17.16b + cmhs v16.16b, v28.16b, v16.16b + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? 
top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... + bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + subs w3, w3, #16 + st1 {v23.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v21.16b}, [x5], #16 + st1 {v20.16b}, [x10], #16 + b.le 8f + ld1 {v5.16b}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.16b}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_8bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw + add x10, x10, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.16b}, [x12] // bottom + add x8, x2, #1 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v6.2s}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + dup v5.16b, v6.b[3] // right + usubl v6.8h, v6.8b, v4.8b // top-bottom + uxtl v7.8h, v7.8b // weights_hor +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + zip1 v1.2s, v1.2s, v0.2s // left, flipped + zip1 v0.2s, v3.2s, v2.2s + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, 
v18.2s, v19.2s + shll v22.8h, v4.8b, #8 // bottom*256 + shll v23.8h, v4.8b, #8 + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h + mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v23.8h, v6.8h, v18.8h + uhadd v20.8h, v20.8h, v22.8h + uhadd v21.8h, v21.8h, v23.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v6.8b}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + dup v5.16b, v6.b[7] // right + usubl v6.8h, v6.8b, v4.8b // top-bottom + uxtl v7.8h, v7.8b // weights_hor +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + usubl v2.8h, v2.8b, v5.8b + usubl v3.8h, v3.8b, v5.8b + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v2.8h, v7.8h // (left flipped) + mla v22.8h, v1.8h, v7.8h + mla v23.8h, v0.8h, v7.8h + mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v6.8h, v17.8h + mla v26.8h, v6.8h, v18.8h + mla v27.8h, v6.8h, v19.8h + uhadd v20.8h, v20.8h, v24.8h + uhadd v21.8h, v21.8h, v25.8h + uhadd v22.8h, v22.8h, v26.8h + uhadd v23.8h, v23.8h, v27.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn v23.8b, v23.8h, #8 + 
st1 {v20.8b}, [x0], x1 + st1 {v21.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + add x12, x2, w3, uxtw + sub x2, x2, #2 + mov x7, #-2 + ld1r {v5.16b}, [x12] // right + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld2r {v0.8b, v1.8b}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v3.16b}, [x8], #16 // top + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + usubl v2.8h, v3.8b, v4.8b // top-bottom + usubl2 v3.8h, v3.16b, v4.16b + mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h // (left flipped) + mla v22.8h, v0.8h, v6.8h + mla v23.8h, v0.8h, v7.8h + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v3.8h, v16.8h + mla v26.8h, v2.8h, v17.8h + mla v27.8h, v3.8h, v17.8h + uhadd v20.8h, v20.8h, v24.8h + uhadd v21.8h, v21.8h, v25.8h + uhadd v22.8h, v22.8h, v26.8h + uhadd v23.8h, v23.8h, v27.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + subs w3, w3, #16 + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw + sub x10, x10, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// 
const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_8bpc_neon, export=1 // per pixel: (bottom*256 + (top-bottom)*weights_ver + 128) >> 8 + movrel x7, X(sm_weights) // x7 = sm_weights table + add x7, x7, w4, uxtw // sm_weights + height, read below as weights_ver + clz w9, w3 // clz(width) + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw // topleft - height + sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4 + ldrh w9, [x5, w9, uxtw #1] // 16-bit offset from the jump table + ld1r {v4.16b}, [x8] // bottom + add x2, x2, #1 // topleft + 1 = start of the top row + sub x5, x5, w9, uxtw // table address - offset = branch target + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 +40: + ld1r {v6.2s}, [x2] // top + usubl v6.8h, v6.8b, v4.8b // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + shll v22.8h, v4.8b, #8 // bottom*256 + shll v23.8h, v4.8b, #8 + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v23.8h, v6.8h, v18.8h + rshrn v22.8b, v22.8h, #8 // rounding shift back to 8 bit + rshrn v23.8b, v23.8h, #8 + st1 {v22.s}[0], [x0], x1 + st1 {v22.s}[1], [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1 {v23.s}[0], [x0], x1 + st1 {v23.s}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v6.8b}, [x2] // top + usubl v6.8h, v6.8b, v4.8b // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v6.8h, v17.8h + mla v26.8h, v6.8h, v18.8h + mla v27.8h, v6.8h, v19.8h + rshrn v24.8b, v24.8h, #8 + rshrn v25.8b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn v27.8b, v27.8h, #8 + st1 {v24.8b}, [x0], x1 + st1 {v25.8b}, [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1 {v26.8b}, [x0], x1 + st1 {v27.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 // x1 is already 2*stride here: x5 = dst + 2*stride + add x8, x6, x1 // x8 = dst + 3*stride + lsl x1, x1, #1 // x1 = 4*stride + sub x1, x1, w3, uxtw // minus width, as the stores below post-increment across the row +
mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b +2: + ld1 {v3.16b}, [x2], #16 // top + shll v20.8h, v4.8b, #8 // bottom*256 + shll v21.8h, v4.8b, #8 + shll v22.8h, v4.8b, #8 + shll v23.8h, v4.8b, #8 + shll v24.8h, v4.8b, #8 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + usubl v2.8h, v3.8b, v4.8b // top-bottom + usubl2 v3.8h, v3.16b, v4.16b + mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v21.8h, v3.8h, v16.8h + mla v22.8h, v2.8h, v17.8h + mla v23.8h, v3.8h, v17.8h + mla v24.8h, v2.8h, v18.8h + mla v25.8h, v3.8h, v18.8h + mla v26.8h, v2.8h, v19.8h + mla v27.8h, v3.8h, v19.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + rshrn v24.8b, v24.8h, #8 + rshrn2 v24.16b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn2 v26.16b, v27.8h, #8 + subs w3, w3, #16 // 16 pixels of each of the four rows per iteration + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v24.16b}, [x5], #16 + st1 {v26.16b}, [x8], #16 + b.gt 2b + subs w4, w4, #4 // four rows done + b.le 9f + sub x2, x2, w9, uxtw // rewind the top pointer by width + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 // restore the width counter + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_8bpc_neon, export=1 // per pixel: (right*256 + (left-right)*weights_hor + 128) >> 8 + movrel x8, X(sm_weights) // x8 = sm_weights table + add x8, x8, w3, uxtw // sm_weights + width, read below as weights_hor + clz w9, w3 // clz(width) + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw // topleft + width + sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.16b}, [x12] // right + sub x5, x5, w9, uxtw // table address - offset = branch target + add x6, x0, x1 // x6 = dst + stride + lsl
x1, x1, #1 + br x5 +40: + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #4 // topleft - 4: four left pixels per load + mov x7, #-4 // post-index step: move four more bytes below topleft per iteration + uxtl v7.8h, v7.8b // weights_hor +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + zip1 v1.2s, v1.2s, v0.2s // left, flipped + zip1 v0.2s, v3.2s, v2.2s + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h + rshrn v20.8b, v20.8h, #8 // rounding shift back to 8 bit + rshrn v21.8b, v21.8h, #8 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #4 // topleft - 4: four left pixels per load + mov x7, #-4 + uxtl v7.8h, v7.8b // weights_hor +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + usubl v3.8h, v3.8b, v5.8b // left-right + usubl v2.8h, v2.8b, v5.8b + usubl v1.8h, v1.8b, v5.8b + usubl v0.8h, v0.8b, v5.8b + mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v2.8h, v7.8h // (left flipped) + mla v22.8h, v1.8h, v7.8h + mla v23.8h, v0.8h, v7.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn v23.8b, v23.8h, #8 + st1 {v20.8b}, [x0], x1 + st1 {v21.8b}, [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + sub x2, x2, #4 + mov x7, #-4 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 // x1 is already 2*stride here: x5 = dst + 2*stride + add x10, x6, x1 // x10 = dst + 3*stride + lsl x1, x1, #1 // x1 = 4*stride + sub x1, x1, w3, uxtw // minus width, as the stores below post-increment across the row + mov w9, w3 // keep width for restarting each group of rows + +1: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + usubl v2.8h, v2.8b, v5.8b + usubl v3.8h, v3.8b, v5.8b +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h,
v5.8b, #8 + shll v23.8h, v5.8b, #8 + shll v24.8h, v5.8b, #8 + shll v25.8h, v5.8b, #8 + shll v26.8h, v5.8b, #8 + shll v27.8h, v5.8b, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v3.8h, v7.8h // (left flipped) + mla v22.8h, v2.8h, v6.8h + mla v23.8h, v2.8h, v7.8h + mla v24.8h, v1.8h, v6.8h + mla v25.8h, v1.8h, v7.8h + mla v26.8h, v0.8h, v6.8h + mla v27.8h, v0.8h, v7.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + rshrn v24.8b, v24.8h, #8 + rshrn2 v24.16b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn2 v26.16b, v27.8h, #8 + subs w3, w3, #16 // 16 pixels of each of the four rows per iteration + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v24.16b}, [x5], #16 + st1 {v26.16b}, [x10], #16 + b.gt 2b + subs w4, w4, #4 // four rows done + b.le 9f + sub x8, x8, w9, uxtw // rewind the weights_hor pointer by width + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 // restore the width counter + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height); +function ipred_filter_8bpc_neon, export=1 + and w5, w5, #511 // keep the low 9 bits of filt_idx + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 // 64 bytes of taps per filter index + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 // filter(0)..filter(3), 8 taps each + clz w9, w3 // clz(width) + adr x5, L(ipred_filter_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] // filter(4)..filter(6) + sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b // taps are signed, widen to 16 bit + sxtl v17.8h, v17.8b + sub x5, x5, w9, uxtw // table address - offset = branch target + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + br x5 +40: + ldur s0, [x2, #1] // top (0-3) + sub x2, x2, #2 +
mov x7, #-2 + uxtl v0.8h, v0.8b // top (0-3) +4: + ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + uxtl v1.8h, v1.8b // left (0-1) + topleft (2) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.8b, v2.8h, #4 // round, shift by 4, saturate to unsigned 8 bit + subs w4, w4, #2 // two rows (one 4x2 block) per iteration + st1 {v2.s}[0], [x0], x1 + uxtl v0.8h, v2.8b + st1 {v2.s}[1], [x6], x1 + ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] + b.gt 4b + ret +80: + ldur d0, [x2, #1] // top (0-7) + sub x2, x2, #2 + mov x7, #-2 + uxtl v0.8h, v0.8b // top (0-7) +8: + ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + uxtl v1.8h, v1.8b // left (0-1) + topleft (2) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.8b, v2.8h, #4 + uxtl v1.8h, v2.8b // first block, in 16 bit + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.8b, v3.8h, #4 + subs w4, w4, #2 // two rows per iteration + st2 {v2.s, v3.s}[0], [x0], x1 // interleave the two 4-wide blocks into one 8-pixel row + zip2 v0.2s, v2.2s, v3.2s // second output row becomes the next top + st2 {v2.s, v3.s}[1], [x6], x1 + uxtl v0.8h, v0.8b + b.gt 8b + ret +160: +320: + add x8, x2, #1 +
sub x2, x2, #2 + mov x7, #-2 + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) + uxtl v0.8h, v0.8b // left (0-1) + topleft (2) +2: + ld1 {v2.16b}, [x8], #16 // top(0-15) + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + uxtl v1.8h, v2.8b // top(0-7) + uxtl2 v2.8h, v2.16b // top(8-15) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.8b, v3.8h, #4 + uxtl v0.8h, v3.8b // first block, in 16 bit + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.8b, v4.8h, #4 + uxtl v0.8h, v4.8b // second block, in 16 bit + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.8b, v5.8h, #4 + uxtl v0.8h, v5.8b // third block, in 16 bit + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) *
filter(5) + mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 // 16 pixels (four 4x2 blocks) per iteration + sqrshrun v6.8b, v6.8h, #4 + + st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 // first row of the four blocks + st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 // second row of the four blocks + b.le 8f + ins v0.h[2], v2.h[7] // next topleft = last top pixel of this group + ins v0.b[0], v6.b[7] // next left[1] = last pixel of the second output row + ins v0.b[2], v6.b[3] // next left[0] = last pixel of the first output row + b 2b +8: + subs w4, w4, #2 // two rows done + b.le 9f + sub x8, x6, w9, uxtw // next top = start of the second row just written (x6 has advanced by width) + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 // restore the width counter + b 1b +9: + ret + +L(ipred_filter_tbl): + .hword L(ipred_filter_tbl) - 320b + .hword L(ipred_filter_tbl) - 160b + .hword L(ipred_filter_tbl) - 80b + .hword L(ipred_filter_tbl) - 40b +endfunc + +// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_8bpc_neon, export=1 + ld1 {v0.8h}, [x2] // eight 16-bit palette entries + clz w9, w4 // clz(w) + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 // table index: 0 for w 64 ... 4 for w 4 + ldrh w9, [x6, w9, uxtw #1] + xtn v0.8b, v0.8h // narrow the palette to bytes for tbl + sub x6, x6, w9, uxtw // table address - offset = branch target + add x2, x0, x1 // x2 = dst + stride (pal pointer no longer needed) + lsl x1, x1, #1 // stride *= 2 + br x6 +4: + ld1 {v1.16b}, [x3], #16 // 16 indices = four rows of four + subs w5, w5, #4 // four rows per iteration + tbl v1.16b, {v0.16b}, v1.16b // index -> palette colour + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x2], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[3], [x2], x1 + b.gt 4b + ret +8: + ld1 {v1.16b, v2.16b}, [x3], #32 + subs w5, w5, #4 + tbl v1.16b, {v0.16b}, v1.16b + st1 {v1.d}[0], [x0], x1 + tbl v2.16b, {v0.16b}, v2.16b + st1 {v1.d}[1], [x2], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x2], x1 + b.gt 8b + ret +16: + ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 + subs w5, w5, #4 + tbl v1.16b, {v0.16b}, v1.16b + tbl v2.16b, {v0.16b}, v2.16b + st1 {v1.16b}, [x0], x1 + tbl v3.16b, {v0.16b}, v3.16b + st1 {v2.16b}, [x2], x1 + tbl v4.16b, {v0.16b}, v4.16b + st1 {v3.16b}, [x0], x1 + st1 {v4.16b}, [x2], x1 + b.gt 16b + ret +32: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 + subs w5, w5, #4 + tbl v16.16b, {v0.16b}, v16.16b + tbl v17.16b, {v0.16b}, v17.16b + tbl v18.16b, {v0.16b}, v18.16b + tbl v19.16b, {v0.16b}, v19.16b + tbl v20.16b,
{v0.16b}, v20.16b + st1 {v16.16b, v17.16b}, [x0], x1 + tbl v21.16b, {v0.16b}, v21.16b + st1 {v18.16b, v19.16b}, [x2], x1 + tbl v22.16b, {v0.16b}, v22.16b + st1 {v20.16b, v21.16b}, [x0], x1 + tbl v23.16b, {v0.16b}, v23.16b + st1 {v22.16b, v23.16b}, [x2], x1 + b.gt 32b + ret +64: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 + subs w5, w5, #2 + tbl v16.16b, {v0.16b}, v16.16b + tbl v17.16b, {v0.16b}, v17.16b + tbl v18.16b, {v0.16b}, v18.16b + tbl v19.16b, {v0.16b}, v19.16b + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 + tbl v20.16b, {v0.16b}, v20.16b + tbl v21.16b, {v0.16b}, v21.16b + tbl v22.16b, {v0.16b}, v22.16b + tbl v23.16b, {v0.16b}, v23.16b + st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 64b + .hword L(pal_pred_tbl) - 32b + .hword L(pal_pred_tbl) - 16b + .hword L(pal_pred_tbl) - 8b + .hword L(pal_pred_tbl) - 4b +endfunc + +// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_128_8bpc_neon, export=1 + clz w9, w3 // clz(width) + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4 + ldrh w9, [x7, w9, uxtw #1] + movi v0.8h, #128 // dc + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw // table address - offset = branch target + add x6, x0, x1 // x6 = dst + stride (alpha already copied to v1) + lsl x1, x1, #1 // stride *= 2 + br x7 +L(ipred_cfl_splat_w4): // shared splat loops: expect dc in v0, alpha in v1, ac in x5 + ld1 {v2.8h, v3.8h}, [x5], #32 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + sshr v4.8h, v2.8h, #15 // sign = diff >> 15 + sshr v5.8h, v3.8h, #15 + add v2.8h, v2.8h, v4.8h // diff + sign + add v3.8h, v3.8h, v5.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[1], [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1
{v3.s}[0], [x0], x1 + st1 {v3.s}[1], [x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + mul v4.8h, v4.8h, v1.8h + mul v5.8h, v5.8h, v1.8h + sshr v16.8h, v2.8h, #15 // sign = diff >> 15 + sshr v17.8h, v3.8h, #15 + sshr v18.8h, v4.8h, #15 + sshr v19.8h, v5.8h, #15 + add v2.8h, v2.8h, v16.8h // diff + sign + add v3.8h, v3.8h, v17.8h + add v4.8h, v4.8h, v18.8h + add v5.8h, v5.8h, v19.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + srshr v4.8h, v4.8h, #6 + srshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + add v4.8h, v4.8h, v0.8h + add v5.8h, v5.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x6], x1 + subs w4, w4, #4 // four rows per iteration + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): // also used for width 32 + add x7, x5, w3, uxtw #1 // x7 = ac + width*2 bytes = second ac row + sub x1, x1, w3, uxtw // 2*stride - width, as the stores post-increment + mov w9, w3 // keep width for restarting each pair of rows +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + mul v4.8h, v4.8h, v1.8h + mul v5.8h, v5.8h, v1.8h + sshr v16.8h, v2.8h, #15 // sign = diff >> 15 + sshr v17.8h, v3.8h, #15 + sshr v18.8h, v4.8h, #15 + sshr v19.8h, v5.8h, #15 + add v2.8h, v2.8h, v16.8h // diff + sign + add v3.8h, v3.8h, v17.8h + add v4.8h, v4.8h, v18.8h + add v5.8h, v5.8h, v19.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + srshr v4.8h, v4.8h, #6 + srshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + add v4.8h, v4.8h, v0.8h + add v5.8h, v5.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + subs w3, w3, #16 // 16 pixels of each of the two rows per iteration +
st1 {v2.8b, v3.8b}, [x0], #16 + st1 {v4.8b, v5.8b}, [x6], #16 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_top_8bpc_neon, export=1 // dc = rounded average of the top row, then the shared splat loops + clz w9, w3 // clz(width) + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #1 // topleft + 1 = start of the top row + sub x7, x7, w9, uxtw // table address - offset = branch target + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x7 +4: + ld1r {v0.2s}, [x2] // four top pixels, replicated into both halves + uaddlv h0, v0.8b // = 2 * sum of the four pixels + urshr v0.4h, v0.4h, #3 // (2*sum + 4) >> 3 = rounded average of four + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) +8: + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 // rounded average of eight + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + urshr v0.4h, v0.4h, #4 // rounded average of 16 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + ld1 {v2.16b, v3.16b}, [x2] + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v2.4h, v2.4h, v3.4h + urshr v2.4h, v2.4h, #5 // rounded average of 32 + dup v0.8h, v2.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_left_8bpc_neon, export=1 // dc = rounded average of the left column, then the shared splat loops + sub x2, x2, w4, uxtw // topleft - height = start of the left column + clz w9, w3 // clz(width), selects the splat loop + clz w8, w4 // clz(height), selects the summing code + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9,
[x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw // x9 = splat loop for this width + sub x7, x7, w8, uxtw // x7 = summing code for this height + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x7 + +L(ipred_cfl_left_h4): + ld1r {v0.2s}, [x2] // four left pixels, replicated into both halves + uaddlv h0, v0.8b // = 2 * sum of the four pixels + urshr v0.4h, v0.4h, #3 // (2*sum + 4) >> 3 = rounded average of four + dup v0.8h, v0.h[0] + br x9 // continue in the width-specific splat loop + +L(ipred_cfl_left_h8): + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 // rounded average of eight + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + urshr v0.4h, v0.4h, #4 // rounded average of 16 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + ld1 {v2.16b, v3.16b}, [x2] + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v2.4h, v2.4h, v3.4h + urshr v2.4h, v2.4h, #5 // rounded average of 32 + dup v0.8h, v2.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_8bpc_neon, export=1 // dc = rounded average of left column and top row, then the shared splat loops + sub x2, x2, w4, uxtw // topleft - height = start of the left column + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 // clz(width) + clz w6, w4 // clz(height); alpha is already in v1 + dup v16.8h, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 26 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh w9, [x7, w9, uxtw #1] // offset of the ipred_cfl_w* entry + ldrh w6, [x7, w6, uxtw #1] // offset of the ipred_cfl_h* entry + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw // x9 = width stage, reached via br x9 from the height stage + sub x7, x7, w6, uxtw // x7 = height stage + ushr v16.8h, v16.8h, #1 // (width + height) >> 1 + dup v17.8h, w8 // -ctz(width + height) + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x7 + +L(ipred_cfl_h4): + ld1 {v0.s}[0], [x2], #4 + ins v0.s[1], wzr // clear the upper four bytes before summing + uaddlv h0, v0.8b // sum of the left column + br x9 +L(ipred_cfl_w4): + add x2, x2, #1 // skip the topleft pixel + ld1 {v2.s}[0], [x2] + ins v2.s[1], wzr + add v0.4h, v0.4h, v16.4h // left sum + rounding term (width + height) >> 1 + uaddlv h2, v2.8b // sum of the top row + cmp w4, #4 + add
v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16 + mov w16, #(0x3334/2) // low half: used when the shift below wraps to 0 (2*h = 32) + movk w16, #(0x5556/2), lsl #16 // high half: used when shifting by 16 (2*h = 16) + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 // register shift amount is taken modulo 32 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h // (x * 0x5556) >> 16 ~ x/3, or (x * 0x3334) >> 16 ~ x/5 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + ld1 {v0.8b}, [x2], #8 + uaddlv h0, v0.8b // sum of the left column + br x9 +L(ipred_cfl_w8): + add x2, x2, #1 // skip the topleft pixel + ld1 {v2.8b}, [x2] + add v0.4h, v0.4h, v16.4h // left sum + rounding term + uaddlv h2, v2.8b // sum of the top row + cmp w4, #8 + add v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h // negative shift count: >> ctz(width + height) + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq // h == 32: 0x3334/2, otherwise 0x5556/2 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + ld1 {v0.16b}, [x2], #16 + uaddlv h0, v0.16b // sum of the left column + br x9 +L(ipred_cfl_w16): + add x2, x2, #1 // skip the topleft pixel + ld1 {v2.16b}, [x2] + add v0.4h, v0.4h, v16.4h // left sum + rounding term + uaddlv h2, v2.16b // sum of the top row + cmp w4, #16 + add v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h // negative shift count: >> ctz(width + height) + b.eq 1f + // h = 4/8/32 + cmp w4, #4 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq // h == 4: 0x3334/2, otherwise 0x5556/2 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + ld1 {v2.16b, v3.16b}, [x2], #32 + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v2.4h, v3.4h // sum of the left column + br x9 +L(ipred_cfl_w32): + add x2, x2, #1 // skip the topleft pixel + ld1 {v2.16b, v3.16b}, [x2] + add v0.4h, v0.4h, v16.4h // left sum + rounding term + uaddlv h2, v2.16b + uaddlv h3, v3.16b + cmp w4, #32 + add v0.4h, v0.4h, v2.4h + add v0.4h, v0.4h, v3.4h + ushl v0.4h, v0.4h, v17.4h // negative shift count: >> ctz(width + height) + b.eq 1f + // h = 8/16 + mov w16, #(0x5556/2) // low half: used when the shift below wraps to 0 (2*h = 32) + movk w16, #(0x3334/2), lsl #16 // high half: used when shifting by 16 (2*h = 16) + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 // register shift amount is taken modulo 32 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) // the w16 splat loop also handles width 32 + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) +
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 + clz w8, w5 // clz(cw) + lsl w4, w4, #2 // h_pad is given in units of four rows + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 // table index: 0 for cw 16, 1 for cw 8, 2 for cw 4 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 // v16-v19 accumulate the running sums + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw // table address - offset = branch target + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 // x10 = ypx + stride (second input row) + dup v31.4s, w9 + lsl x2, x2, #1 // stride *= 2 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v0.d}[1], [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b // add horizontal pixel pairs + uaddlp v1.8h, v1.16b + add v0.8h, v0.8h, v1.8h // add the two rows: 2x2 sums + shl v0.8h, v0.8h, #1 // 2x2 sum * 2 + subs w8, w8, #2 // two output rows per iteration + st1 {v0.8h}, [x0], #16 + add v16.8h, v16.8h, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d // replicate the last output row for vertical padding + trn2 v0.2d, v0.2d, v0.2d +L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + b.gt 2b +3: + // Aggregate the sums + add v0.8h, v16.8h, v17.8h + uaddlv s0, v0.8h // sum + sub x0, x0, w6, uxtw #3 // rewind ac by height * 8 bytes (four int16 per row) + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 // four rows per iteration + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b},
[x10], x2 + ld1 {v2.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v3.16b}, [x10], x2 + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + add v0.8h, v0.8h, v1.8h + add v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v0.d}[1], [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b // add horizontal pixel pairs + uaddlp v1.8h, v1.16b + add v0.8h, v0.8h, v1.8h // add the two rows: 2x2 sums + shl v0.8h, v0.8h, #1 // 2x2 sum * 2 + dup v1.4h, v0.h[3] // pad: repeat the last valid value of the first row + dup v3.4h, v0.h[7] // pad: repeat the last valid value of the second row + trn2 v2.2d, v0.2d, v0.2d // second row's valid half moved to the low half + subs w8, w8, #2 // two output rows per iteration + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + add v16.4h, v16.4h, v0.4h + add v17.4h, v17.4h, v1.4h + add v18.4h, v18.4h, v2.4h + add v19.4h, v19.4h, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d // last output row (valid half + padding), used for vertical padding + trn1 v1.2d, v2.2d, v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 // four padded rows per iteration + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + add v18.8h, v18.8h, v0.8h + add v19.8h, v19.8h, v1.8h + b.gt 2b +3: + +L(ipred_cfl_ac_420_w8_calc_subtract_dc): + // Aggregate the sums + add v0.8h, v16.8h, v17.8h + add v2.8h, v18.8h, v19.8h + uaddlp v0.4s, v0.8h // widen to 32 bit while adding pairs + uaddlp v2.4s, v2.8h + add v0.4s, v0.4s, v2.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #4 // rewind ac by height * 16 bytes + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +L(ipred_cfl_ac_420_w8_subtract_dc): +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w6, w6, #4 // 64 bytes of ac per iteration + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + sub v2.8h, v2.8h, v4.8h + sub v3.8h, v3.8h, v4.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w16): + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7,
w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v2.16b, v3.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + ld1 {v4.16b, v5.16b}, [x1], x2 + uaddlp v1.8h, v1.16b + ld1 {v6.16b, v7.16b}, [x10], x2 + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + uaddlp v4.8h, v4.16b + uaddlp v5.8h, v5.16b + uaddlp v6.8h, v6.16b + uaddlp v7.8h, v7.16b + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v1.8h, #1 + shl v2.8h, v4.8h, #1 + shl v3.8h, v5.8h, #1 + subs w8, w8, #2 // two output rows per iteration + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b // keep the last output row in v0/v1 for vertical padding + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr d1, [x1, #16] + ld1 {v0.16b}, [x1], x2 + ldr d3, [x10, #16] + ld1 {v2.16b}, [x10], x2 + uaddlp v1.4h, v1.8b + ldr d5, [x1, #16] + uaddlp v0.8h, v0.16b + ld1 {v4.16b}, [x1], x2 + uaddlp v3.4h, v3.8b + ldr d7, [x10, #16] + uaddlp v2.8h, v2.16b + ld1 {v6.16b}, [x10], x2 + uaddlp v5.4h, v5.8b + uaddlp v4.8h, v4.16b + uaddlp v7.4h, v7.8b + uaddlp v6.8h, v6.16b + add v1.4h, v1.4h, v3.4h + add v0.8h, v0.8h, v2.8h + add v5.4h, v5.4h, v7.4h + add v4.8h, v4.8h, v6.8h + shl v1.4h, v1.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v5.4h, #1 + shl v2.8h, v4.8h, #1 + dup v4.4h, v1.h[3] // pad: repeat the last valid value of each row + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d // valid half + padding half + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.16b}, [x1], x2 +
ld1 {v2.16b}, [x10], x2 + ld1 {v4.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v6.16b}, [x10], x2 + uaddlp v2.8h, v2.16b + uaddlp v4.8h, v4.16b + uaddlp v6.8h, v6.16b + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + ld1 {v4.8b}, [x1], x2 + uaddlp v0.4h, v0.8b + ld1 {v6.8b}, [x10], x2 + uaddlp v2.4h, v2.8b + uaddlp v4.4h, v4.8b + uaddlp v6.4h, v6.8b + add v0.4h, v0.4h, v2.4h + add v4.4h, v4.4h, v6.4h + shl v0.4h, v0.4h, #1 + shl v2.4h, v4.4h, #1 + dup v1.8h, v0.h[3] // pad: repeat the last valid value of each row + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d // four valid values + four padding values + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b // keep the last output row in v0/v1 for vertical padding + mov v1.16b, v3.16b + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 // four padded rows per iteration + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 2b +3: + + // Double the height and reuse the w8 summing/subtracting + lsl w6, w6, #1 + b L(ipred_cfl_ac_420_w8_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) -
L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + ld1 {v0.8b}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.8b}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + subs w8, w8, #4 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x10], x2 + ld1 {v2.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v3.16b}, [x10], x2 + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + shl v2.8h, v2.8h, #2 + shl v3.8h, v3.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add
v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8b}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v2.8b}, [x1], x2 + ld1 {v2.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v2.8h, v2.16b + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v2.16b, v3.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + shl v2.8h, v2.8h, #2 + shl v3.8h, v3.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr d1, [x1, #16] + ld1 {v0.16b}, [x1], x2 + ldr d3, [x10, #16] + ld1 {v2.16b}, [x10], x2 + uaddlp v1.4h, v1.8b + uaddlp v0.8h, v0.16b + uaddlp v3.4h, v3.8b + uaddlp v2.8h, v2.16b + shl v1.4h, v1.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v3.4h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + 
trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v2.8h, v2.16b + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + uaddlp v0.4h, v0.8b + uaddlp v2.4h, v2.8b + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const 
pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + clz w8, w5 // clz(width) + lsl w4, w4, #2 // h_pad *= 4 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 // clz(width) - 26: index into the jump table + ldrh w8, [x7, w8, uxtw #1] // 16 bit table entry (table - label) + movi v16.8h, #0 // v16-v19: accumulators for the stored values + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw // address of the width specific code + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 // x10 = ypx + stride (second input row) + dup v31.4s, w9 + lsl x2, x2, #1 // stride *= 2; x1 and x10 each step two rows + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x10], x2 + ld1 {v1.s}[0], [x1], x2 + ld1 {v1.s}[1], [x10], x2 + ushll v0.8h, v0.8b, #3 // widen to 16 bit, scaled by 8 + ushll v1.8h, v1.8b, #3 + subs w8, w8, #4 // four input rows consumed per iteration + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v2.8b}, [x1], x2 + ushll v0.8h, v0.8b, #3 + ld1 {v3.8b}, [x10], x2 + ushll v1.8h, v1.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v3.8h, v3.8b, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) // w_pad != 0: take the padded path +1: // Copy and expand input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + ld1 {v4.16b}, [x1], x2 + ushll2 v1.8h, v0.16b, #3 + ushll v0.8h, v0.8b, #3 + ld1 {v6.16b}, [x10], x2 + ushll2 v3.8h, v2.16b, #3 + ushll v2.8h, v2.8b, #3 + ushll2 v5.8h, v4.16b, #3 + ushll v4.8h, v4.8b, #3 + ushll2 v7.8h, v6.16b, #3 + ushll v6.8h, v6.8b, #3 + subs
w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + ld1 {v4.8b}, [x1], x2 + ld1 {v6.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v4.8h, v4.8b, #3 + ushll v6.8h, v6.8b, #3 + dup v1.8h, v0.h[7] // pad with the last loaded value of the row + dup v3.8h, v2.h[7] + dup v5.8h, v4.h[7] + dup v7.8h, v6.h[7] + subs w8, w8, #4 // four input rows consumed per iteration + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + ld1 {v2.16b, v3.16b}, [x1], x2 + ld1 {v6.16b, v7.16b}, [x10], x2 + ushll v0.8h, v2.8b, #3 + ushll2 v1.8h, v2.16b, #3 + ushll v2.8h, v3.8b, #3 + ushll2 v3.8h, v3.16b, #3 + ushll v4.8h, v6.8b, #3 + ushll2 v5.8h, v6.16b, #3 + ushll v6.8h, v7.8b, #3 + ushll2 v7.8h, v7.16b, #3 + subs w8, w8, #2 // two input rows consumed per iteration + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h,
v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + ldr d2, [x1, #16] + ld1 {v1.16b}, [x1], x2 + ldr d6, [x10, #16] + ld1 {v5.16b}, [x10], x2 + ushll v2.8h, v2.8b, #3 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, v1.16b, #3 + ushll v6.8h, v6.8b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v3.8h, v2.h[7] // pad with the last loaded value of the row + dup v7.8h, v6.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + ld1 {v1.16b}, [x1], x2 + ld1 {v5.16b}, [x10], x2 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, v1.16b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v2.8h, v1.h[7] // pad with the last loaded value of the row + dup v3.8h, v1.h[7] + dup v6.8h, v5.h[7] + dup v7.8h, v5.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + ld1 {v0.8b}, [x1], x2 + ld1 {v4.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v4.8h, v4.8b, #3 + dup v1.8h, v0.h[7] // pad with the last loaded value of the row + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + dup v5.8h, v4.h[7] + dup v6.8h, v4.h[7] + dup v7.8h, v4.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add
v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 // two padded rows written per iteration + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // v4-v7 still hold the last row stored above + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w8 subtracting + lsl w6, w6, #2 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w8_calc_subtract_dc. + uaddlp v0.4s, v16.8h // widen the 16 bit partial sums to 32 bit + uaddlp v1.4s, v17.8h + uaddlp v2.4s, v18.8h + uaddlp v3.4s, v19.8h + add v0.4s, v0.4s, v1.4s + add v2.4s, v2.4s, v3.4s + add v0.4s, v0.4s, v2.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #4 // rewind x0 by height * 64 bytes, i.e. everything stored above + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] // broadcast the result to all lanes + b L(ipred_cfl_ac_420_w8_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S new file mode 100644 index 0000000000..a3993d034a --- /dev/null +++ b/third_party/dav1d/src/arm/64/ipred16.S @@ -0,0 +1,3076 @@ +/* + * Copyright
© 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max, the ninth argument, is passed on the stack + clz w3, w3 // clz(width) + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 // clz(width) - 25: index into the jump table + ldrh w3, [x5, w3, uxtw #1] // 16 bit table entry (table - label) + dup v0.8h, w8 + sub x5, x5, w3, uxtw // address of the width specific code + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + urshr v0.8h, v0.8h, #1 // (bitdepth_max + 1) >> 1 + br x5 +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + mov v1.16b, v0.16b +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 // the first store of each row already post-increments by 64 +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 160b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void
ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + clz w3, w3 // clz(width) + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 // clz(width) - 25: index into the jump table + ldrh w3, [x5, w3, uxtw #1] // 16 bit table entry (table - label) + add x2, x2, #2 // one 16 bit pixel past topleft + sub x5, x5, w3, uxtw // address of the width specific code + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 +40: + ld1 {v0.4h}, [x2] // loaded once, then stored to every row +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8h}, [x2] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.8h, v1.8h}, [x2] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + sub x1, x1, #64 // the first store of each row already post-increments by 64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int
height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + clz w3, w3 // clz(width) + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 // clz(width) - 25: index into the jump table + ldrh w3, [x5, w3, uxtw #1] // 16 bit table entry (table - label) + sub x2, x2, #8 // topleft - 4 pixels (8 bytes) + sub x5, x5, w3, uxtw // address of the width specific code + mov x7, #-8 // each load steps back another 4 pixels + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 +4: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // 4 pixels, each replicated across one register + st1 {v3.4h}, [x0], x1 // v3 is the pixel nearest to topleft; it fills the first row + st1 {v2.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v1.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +16: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] // second half of the row; the st1 below writes the first half and advances + str q2, [x6, #16] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 16b + ret +32: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 32b + ret +64: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + stp q3, q3, [x0, #64] + stp q2, q2, [x6, #64] + stp q3, q3, [x0, #96] + stp q2, q2, [x6, #96] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + stp q1, q1, [x0, #64] + stp q0, q0, [x6, #64] + stp q1, q1, [x0, #96] + stp q0, q0, [x6, #96] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc +
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + clz w3, w3 // clz(width) + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 // clz(width) - 25: index into the jump table + ldrh w3, [x5, w3, uxtw #1] // 16 bit table entry (table - label) + add x2, x2, #2 // one 16 bit pixel past topleft + sub x5, x5, w3, uxtw // address of the width specific code + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 +40: + ld1 {v0.4h}, [x2] + addv h0, v0.4h // sum of the 4 loaded pixels + urshr v0.4h, v0.4h, #2 // (sum + 2) >> 2 + dup v0.4h, v0.h[0] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8h}, [x2] + addv h0, v0.8h // sum of the 8 loaded pixels + urshr v0.4h, v0.4h, #3 // (sum + 4) >> 3 + dup v0.8h, v0.h[0] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h // pairwise add: 16 values -> 8 + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 // (sum + 8) >> 4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h // final reduction widened to 32 bit + rshrn v4.4h, v0.4s, #5 // (sum + 16) >> 5, narrowed back to 16 bit + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 // (sum + 32) >> 6, narrowed back to 16 bit + sub x1, x1, #64 // the first store of each row already post-increments by 64 + dup
v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 // topleft - height pixels + clz w3, w3 // clz(width) + clz w7, w4 // clz(height) + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 // clz(height) - 25: index of the h* entry + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw // x3 = width specific store loop (w* label) + sub x5, x5, w7, uxtw // x5 = height specific summing code (h* label) + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 + +L(ipred_dc_left_h4): + ld1 {v0.4h}, [x2] + addv h0, v0.4h // sum of the 4 loaded pixels + urshr v0.4h, v0.4h, #2 // (sum + 2) >> 2 + dup v0.8h, v0.h[0] + br x3 // continue with the store loop for this width +L(ipred_dc_left_w4): + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 // (sum + 4) >> 3 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w8): + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h // pairwise add: 16 values -> 8 + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 // (sum + 8) >> 4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] + br x3
+L(ipred_dc_left_w16): + mov v1.16b, v0.16b +1: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h32): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlp v0.4s, v0.8h + addv s0, v0.4s + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w32): + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w64): + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword
L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 // topleft - height pixels + add w7, w3, w4 // width + height + clz w3, w3 // clz(width) + clz w6, w4 // clz(height) + dup v16.4s, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 // clz(height) - 25: index of the h* entry + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw // x3 = width specific code (w* label) + sub x5, x5, w6, uxtw // x5 = height specific code (h* label) + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w7 // -ctz(width + height) + add x6, x0, x1 // x6 = dst + stride (second output row) + lsl x1, x1, #1 // stride *= 2; x0 and x6 each step two rows + br x5 + +L(ipred_dc_h4): + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h // sum of the height pixels before topleft + br x3 // continue with the code for this width +L(ipred_dc_w4): + add x2, x2, #2 // x2 is back at topleft here; skip that pixel + ld1 {v1.4h}, [x2] + add v0.2s, v0.2s, v16.2s // + ((width + height) >> 1) for rounding + uaddlv s1, v1.4h // sum of the width pixels after topleft + cmp w4, #4 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s // >> ctz(width + height) + b.eq 1f // height == width: the shift is the whole division + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 // 0x6667 / 2^17 ~= 1/5 + mov w17, #0xAAAB // 0xAAAB / 2^17 ~= 1/3 + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.4h, v0.h[0] +2: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 // four rows written per iteration + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w8): + add x2, x2, #2 + ld1 {v1.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.8h + cmp w4, #8 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 // 0x6667 / 2^17 ~= 1/5 + mov w17, #0xAAAB // 0xAAAB / 2^17 ~= 1/3 + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s,
#17 +1: + dup v0.8h, v0.h[0] +2: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + ld1 {v0.8h, v1.8h}, [x2], #32 + addp v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w16): + add x2, x2, #2 + ld1 {v1.8h, v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + uaddlv s1, v1.8h + cmp w4, #16 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 // 0x6667 / 2^17 ~= 1/5 + mov w17, #0xAAAB // 0xAAAB / 2^17 ~= 1/3 + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w32): + add x2, x2, #2 + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + addp v3.8h, v3.8h, v4.8h + addp v1.8h, v1.8h, v3.8h + uaddlv s1, v1.8h + cmp w4, #32 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16/64 + cmp w4, #8 + mov w16, #0x6667 // 0x6667 / 2^17 ~= 1/5 + mov w17, #0xAAAB // 0xAAAB / 2^17 ~= 1/3 + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h +
addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w64): + add x2, x2, #2 + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] + addp v3.8h, v3.8h, v4.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + addp v1.8h, v1.8h, v3.8h + addp v20.8h, v20.8h, v22.8h + addp v1.8h, v1.8h, v20.8h + uaddlv s1, v1.8h + cmp w4, #64 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 16/32 + cmp w4, #16 + mov w16, #0x6667 // 0x6667 / 2^17 ~= 1/5 + mov w17, #0xAAAB // 0xAAAB / 2^17 ~= 1/3 + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + sub x1, x1, #64 // the first store of each row already post-increments by 64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + clz w9, w3 + adr
x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x2] // topleft + add x8, x2, #2 // x8 = top row + sub x2, x2, #8 // x2 = &topleft[-4] + sub x5, x5, w9, uxtw // x5 = routine for this width + mov x7, #-8 // the left edge is walked backwards, 4 pixels per load + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x5 +40: + ld1r {v5.2d}, [x8] // top (4 pixels, duplicated into both halves) + sub v6.8h, v5.8h, v4.8h // top - topleft +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left, one pixel broadcast per register + zip1 v0.2d, v0.2d, v1.2d + zip1 v2.2d, v2.2d, v3.2d + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v2.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v4.8h, v16.8h // tldiff + sabd v23.8h, v4.8h, v17.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v2.8h, v17.8h + umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) + umin v19.8h, v21.8h, v23.8h + cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v23.8h, v21.8h + cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v19.8h, v17.8h + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 4b + ret +80: +160: +320: +640: + ld1 {v5.8h}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 +2: + sub v6.8h, v5.8h, v4.8h // top - topleft + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v1.8h + add v18.8h, v6.8h, v2.8h + add v19.8h, v6.8h, v3.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v5.8h, v18.8h + sabd v23.8h, v5.8h, v19.8h + sabd v24.8h, v4.8h, v16.8h // tldiff + sabd v25.8h, v4.8h, v17.8h + sabd v26.8h, v4.8h, v18.8h + sabd v27.8h, v4.8h, v19.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v1.8h, v17.8h + sabd v18.8h, v2.8h, v18.8h + sabd v19.8h, v3.8h, v19.8h + umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) + umin v29.8h, v21.8h, v25.8h + umin v30.8h, v22.8h, v26.8h + umin v31.8h, v23.8h, v27.8h + cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v25.8h, v21.8h + cmge v22.8h, v26.8h, v22.8h + cmge v23.8h, v27.8h, v23.8h + cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v29.8h, v17.8h + cmge v18.8h, v30.8h, v18.8h + cmge v19.8h, v31.8h, v19.8h + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
+ bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + st1 {v23.8h}, [x0], #16 + st1 {v22.8h}, [x6], #16 + subs w3, w3, #8 + st1 {v21.8h}, [x5], #16 + st1 {v20.8h}, [x10], #16 + b.le 8f + ld1 {v5.8h}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.8h}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 // reset the column counter to the full width + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// Each output pixel is ((bottom+right)*256 + (left-right)*weights_hor +// + (top-bottom)*weights_ver + 256) >> 9. +function ipred_smooth_16bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw // x11 = sm_weights + height (weights_ver) + add x10, x10, w3, uxtw // x10 = sm_weights + width (weights_hor) + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw #1 // x12 = &topleft[-height] + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x12] // bottom + add x8, x2, #2 // x8 = top row + sub x5, x5, w9, uxtw // x5 = routine for this width + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x5 +40: + ld1r {v6.2d}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #8 // x2 = &topleft[-4] + mov x7, #-8 // the left edge is walked backwards, 4 pixels per load + dup v5.8h, v6.h[3] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + sub v0.8h, v0.8h, v5.8h // left-right +
sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v0.8h, v7.8h + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v18.4h + smlal2 v23.4s, v6.8h, v18.8h + rshrn v20.4h, v20.4s, #9 + rshrn v21.4h, v21.4s, #9 + rshrn v22.4h, v22.4s, #9 + rshrn v23.4h, v23.4s, #9 + st1 {v20.4h}, [x0], x1 + st1 {v21.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.4h}, [x0], x1 + st1 {v23.4h}, [x6], x1 + b.gt 4b + ret +80: + ld1 {v6.8h}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + dup v5.8h, v6.h[7] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v3.8h, v7.8h // (left flipped) + smlal v22.4s, v2.4h, v7.4h + smlal2 v23.4s, v2.8h, v7.8h + smlal v24.4s, v1.4h, v7.4h + smlal2 v25.4s, v1.8h, v7.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v17.4h + smlal2 v23.4s, v6.8h, v17.8h + smlal v24.4s, v6.4h, v18.4h + smlal2 v25.4s, v6.8h, v18.8h + smlal v26.4s, v6.4h, v19.4h + smlal2 
v27.4s, v6.8h, v19.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + add x12, x2, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + ld1r {v5.8h}, [x12] // right + sub x2, x2, #4 + mov x7, #-4 + mov w9, w3 + add v31.4h, v4.4h, v5.4h // bottom+right + +1: + ld2r {v0.8h, v1.8h}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v2.8h, v3.8h}, [x8], #32 // top + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor + smlal2 v21.4s, v1.8h, v6.8h // (left flipped) + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v24.4s, v0.4h, v6.4h + smlal2 v25.4s, v0.8h, v6.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v2.8h, v16.8h + smlal v22.4s, v3.4h, v16.4h + smlal2 v23.4s, v3.8h, v16.8h + smlal v24.4s, v2.4h, v17.4h + smlal2 v25.4s, v2.8h, v17.8h + smlal v26.4s, v3.4h, v17.4h + smlal2 v27.4s, v3.8h, v17.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 
v23.8h, v27.4s, #9 + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw #1 // rewind the top pointer by width pixels + sub x10, x10, w9, uxtw // rewind weights_hor by width bytes + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 // reset the column counter to the full width + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +// Each output pixel is bottom + (((top-bottom)*weights_ver + 128) >> 8). +function ipred_smooth_v_16bpc_neon, export=1 + movrel x7, X(sm_weights) + add x7, x7, w4, uxtw // x7 = sm_weights + height (weights_ver) + clz w9, w3 + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw #1 // x8 = &topleft[-height] + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x8] // bottom + add x2, x2, #2 // x2 = top row + sub x5, x5, w9, uxtw // x5 = routine for this width + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x5 +40: + ld1r {v6.2d}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v18.8h, v18.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v18.8h + add v20.8h, v20.8h, v4.8h // + bottom + add v21.8h, v21.8h, v4.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v6.8h}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v17.8h + sqrdmulh v22.8h, v6.8h,
v18.8h + sqrdmulh v23.8h, v6.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 + add x8, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 +2: + ld1 {v2.8h, v3.8h}, [x2], #32 // top + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v16.8h + sqrdmulh v22.8h, v2.8h, v17.8h + sqrdmulh v23.8h, v3.8h, v17.8h + sqrdmulh v24.8h, v2.8h, v18.8h + sqrdmulh v25.8h, v3.8h, v18.8h + sqrdmulh v26.8h, v2.8h, v19.8h + sqrdmulh v27.8h, v3.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v4.8h + add v26.8h, v26.8h, v4.8h + add v27.8h, v27.8h, v4.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x8], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x2, x2, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// 
const int max_width, const int max_height); +// Each output pixel is right + (((left-right)*weights_hor + 128) >> 8). +function ipred_smooth_h_16bpc_neon, export=1 + movrel x8, X(sm_weights) + add x8, x8, w3, uxtw // x8 = sm_weights + width (weights_hor) + clz w9, w3 + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw #1 // x12 = &topleft[width] + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.8h}, [x12] // right + sub x5, x5, w9, uxtw // x5 = routine for this width + add x6, x0, x1 // x6 = dst + stride + lsl x1, x1, #1 // stride *= 2 + br x5 +40: + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #8 // x2 = &topleft[-4] + mov x7, #-8 // the left edge is walked backwards, 4 pixels per load + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v1.8h, v7.8h + add v20.8h, v20.8h, v5.8h // + right + add v21.8h, v21.8h, v5.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #8 // x2 = &topleft[-4] + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v3.8h, v3.8h, v5.8h // left-right + sub v2.8h, v2.8h, v5.8h + sub v1.8h, v1.8h, v5.8h + sub v0.8h, v0.8h, v5.8h + sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v1.8h, v7.8h + sqrdmulh v23.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h // + right + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + sub x2, x2, #8 // x2 = &topleft[-4] + mov x7, #-8 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 // keep the width for resetting w3 after each row group + +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + ushll v6.8h, v7.8b, #7 // weights_hor << 7 + ushll2 v7.8h, v7.16b, #7 + sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v2.8h, v6.8h + sqrdmulh v23.8h, v2.8h, v7.8h + sqrdmulh v24.8h, v1.8h, v6.8h + sqrdmulh v25.8h, v1.8h, v7.8h + sqrdmulh v26.8h, v0.8h, v6.8h + sqrdmulh v27.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + add v24.8h, v24.8h, v5.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v5.8h + add v27.8h, v27.8h, v5.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x10], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon + and w5, w5, #511 + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 + clz w9, w3 + adr x5, L(ipred_filter\bpc\()_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] + sub w9, w9, #26 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + sub x5, x5, w9, uxtw + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 + lsl x1, x1, #1 + 
sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + dup v31.8h, w8 // upper clamp bound; w8 is loaded from the stack by ipred_filter_16bpc_neon + movi v30.8h, #0 // lower clamp bound + br x5 +40: + ldur d0, [x2, #2] // top (0-3) + sub x2, x2, #4 + mov x7, #-4 +4: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 +.endif + smin v2.8h, v2.8h, v31.8h + subs w4, w4, #2 + st1 {v2.d}[0], [x0], x1 // first output row (lanes 0-3) + ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] + st1 {v2.d}[1], [x6], x1 // second output row (lanes 4-7) + b.gt 4b + ret +80: + ldur q0, [x2, #2] // top (0-7) + sub x2, x2, #4 + mov x7, #-4 +8: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] //
p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) + smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) + smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 + smin v2.8h, v2.8h, v31.8h + smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) + smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) 
* filter(0) + smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) + smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) + smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) + smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.4h, v4.4s, #4 + sqrshrun2 v3.8h, v5.4s, #4 +.endif + smin v3.8h, v3.8h, v31.8h + subs w4, w4, #2 + st2 {v2.d, v3.d}[0], [x0], x1 + zip2 v0.2d, v2.2d, v3.2d + st2 {v2.d, v3.d}[1], [x6], x1 + b.gt 8b + ret +160: +320: + add x8, x2, #2 + sub x2, x2, #4 + mov x7, #-4 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) +2: + ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) +.if \bpc == 10 + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h + smin v3.8h, v3.8h, v31.8h + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * 
filter(3) + srshr v4.8h, v4.8h, #4 + smax v4.8h, v4.8h, v30.8h + smin v4.8h, v4.8h, v31.8h + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + srshr v5.8h, v5.8h, #4 + smax v5.8h, v5.8h, v30.8h + smin v5.8h, v5.8h, v31.8h + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + srshr v6.8h, v6.8h, #4 + smax v6.8h, v6.8h, v30.8h +.else + smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) + smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) + smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) + smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) + smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) + smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) + smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) + smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) + smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) + smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) + smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.4h, v3.4s, #4 + sqrshrun2 v3.8h, v4.4s, #4 + smin v3.8h, v3.8h, v31.8h + smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) + smlal 
v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) + smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) + smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) + smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) + smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) + smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) + smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.4h, v5.4s, #4 + sqrshrun2 v4.8h, v6.4s, #4 + smin v4.8h, v4.8h, v31.8h + smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) + smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) + smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) + smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) + smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) + smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) + smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) + smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.4h, v24.4s, #4 + sqrshrun2 v5.8h, v25.4s, #4 + smin v5.8h, v5.8h, v31.8h + smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) + smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) + smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) + smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) + smull2 
v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) + smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + sqrshrun v6.4h, v26.4s, #4 + sqrshrun2 v6.8h, v27.4s, #4 +.endif + smin v6.8h, v6.8h, v31.8h + + ins v0.h[2], v2.h[7] // next topleft = top[15] + st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 + ins v0.h[0], v6.h[7] // next left[1] = last pixel of the second row + st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 + ins v0.h[1], v6.h[3] // next left[0] = last pixel of the first row + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x6, w9, uxtw #1 // next top = start of the second row just written + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 // reset the column counter to the full width + b 1b +9: + ret + +L(ipred_filter\bpc\()_tbl): + .hword L(ipred_filter\bpc\()_tbl) - 320b + .hword L(ipred_filter\bpc\()_tbl) - 160b + .hword L(ipred_filter\bpc\()_tbl) - 80b + .hword L(ipred_filter\bpc\()_tbl) - 40b +endfunc +.endm + +filter_fn 10 // variant accumulating in 16-bit lanes (mul/mla) +filter_fn 12 // variant accumulating in 32-bit lanes (smull/smlal) + +function ipred_filter_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max: ninth argument, first stack slot + cmp w8, 0x3ff // 0x3ff = largest 10-bit pixel value + b.le ipred_filter_10bpc_neon // tail call; w8 stays live for the callee + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + ld1 {v30.8h}, [x2] + clz w9, w4 + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 + ldrh w9, [x6, w9, uxtw #1] + movi v31.8h, #1, lsl #8 + sub x6, x6, w9, uxtw + br x6 +40: + add x2, x0, x1 + lsl x1, x1, #1 +4: + ld1 {v1.16b}, [x3], #16 + subs w5, w5, #4 + // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ add v1.16b, v1.16b, v1.16b + zip1 v0.16b, v1.16b, v1.16b + zip2 v1.16b, v1.16b, v1.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + st1 {v0.d}[0], [x0], x1 + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.d}[1], [x2], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x2], x1 + b.gt 4b + ret +80: + add x2, x0, x1 + lsl x1, x1, #1 +8: + ld1 {v2.16b, v3.16b}, [x3], #32 + subs w5, w5, #4 + add v2.16b, v2.16b, v2.16b + add v3.16b, v3.16b, v3.16b + zip1 v0.16b, v2.16b, v2.16b + zip2 v1.16b, v2.16b, v2.16b + zip1 v2.16b, v3.16b, v3.16b + zip2 v3.16b, v3.16b, v3.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.8h}, [x0], x1 + tbl v2.16b, {v30.16b}, v2.16b + st1 {v1.8h}, [x2], x1 + tbl v3.16b, {v30.16b}, v3.16b + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x2], x1 + b.gt 8b + ret +160: + add x2, x0, x1 + lsl x1, x1, #1 +16: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #4 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + st1 {v2.8h, v3.8h}, [x2], x1 + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, 
v7.8h}, [x2], x1 + b.gt 16b + ret +320: + add x2, x0, x1 + lsl x1, x1, #1 +32: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #2 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 32b + ret +640: + add x2, x0, #64 +64: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #1 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl 
v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 640b + .hword L(pal_pred_tbl) - 320b + .hword L(pal_pred_tbl) - 160b + .hword L(pal_pred_tbl) - 80b + .hword L(pal_pred_tbl) - 40b +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + urshr v0.8h, v31.8h, #1 + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +L(ipred_cfl_splat_w4): + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #4 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v16.4s, v2.4s, #31 // sign = diff >> 31 + sshr v17.4s, v3.4s, #31 + sshr v18.4s, v4.4s, #31 + sshr v19.4s, v5.4s, #31 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x6], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v3.d}[1], [x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #2 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v16.4s, v2.4s, #31 // sign = diff >> 
31 + sshr v17.4s, v3.4s, #31 + sshr v18.4s, v4.4s, #31 + sshr v19.4s, v5.4s, #31 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): + add x7, x5, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + subs w3, w3, #16 + smull v16.4s, v2.4h, v1.4h // diff = ac * alpha + smull2 v17.4s, v2.8h, v1.8h + smull v18.4s, v3.4h, v1.4h + smull2 v19.4s, v3.8h, v1.8h + smull v2.4s, v4.4h, v1.4h + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v20.4s, v16.4s, #31 // sign = diff >> 31 + sshr v21.4s, v17.4s, #31 + sshr v22.4s, v18.4s, #31 + sshr v23.4s, v19.4s, #31 + sshr v24.4s, v2.4s, #31 + sshr v25.4s, v3.4s, #31 + sshr v26.4s, v4.4s, #31 + sshr v27.4s, v5.4s, #31 + add v16.4s, v16.4s, v20.4s // diff + sign + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v23.4s + add v2.4s, v2.4s, v24.4s + add v3.4s, v3.4s, v25.4s + add v4.4s, v4.4s, v26.4s + add v5.4s, v5.4s, v27.4s + rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + rshrn v6.4h, v2.4s, #6 + rshrn2 v6.8h, v3.4s, #6 + rshrn v7.4h, v4.4s, #6 + rshrn2 v7.8h, v5.4s, #6 + add v2.8h, v16.8h, v0.8h // dc + apply_sign() + add v3.8h, v17.8h, v0.8h + add v4.8h, v6.8h, v0.8h + add v5.8h, v7.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, 
v5.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.8h, v5.8h}, [x6], #32 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #2 + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +4: + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) +8: + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// 
const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + clz w9, w3 + clz w8, w4 + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9, [x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw + sub x7, x7, w8, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_left_h4): + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h8): + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 + clz w6, w4 + dup v16.4s, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 26 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh 
w9, [x7, w9, uxtw #1] + ldrh w6, [x7, w6, uxtw #1] + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw + sub x7, x7, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w8 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_h4): + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + br x9 +L(ipred_cfl_w4): + add x2, x2, #2 + ld1 {v2.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.4h + cmp w4, #4 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w8): + add x2, x2, #2 + ld1 {v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.8h + cmp w4, #8 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + ld1 {v2.8h, v3.8h}, [x2], #32 + addp v0.8h, v2.8h, v3.8h + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w16): + add x2, x2, #2 + ld1 {v2.8h, v3.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v2.8h, v2.8h, v3.8h + uaddlv s2, v2.8h + cmp w4, #16 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w32): + add 
x2, x2, #2 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + add v0.4s, v0.4s, v16.4s + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v2.8h, v2.8h, v4.8h + cmp w4, #32 + uaddlv s2, v2.8h + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + subs w8, w8, #2 + st1 {v0.8h}, [x0], #16 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d + trn2 v0.2d, v0.2d, v0.2d 
+L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + add v24.4s, v24.4s, v25.4s + add v26.4s, v26.4s, v27.4s + add v0.4s, v24.4s, v26.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v4.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + dup v1.4h, v0.h[3] + dup v3.4h, v0.h[7] + trn2 v2.2d, v0.2d, v0.2d + subs w8, w8, #2 + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw v25.4s, v25.4s, v1.4h + uaddw v26.4s, v26.4s, v2.4h + uaddw v27.4s, v27.4s, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d + trn1 v1.2d, v2.2d, 
v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl w6, w6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 + add v0.8h, v0.8h, v4.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 + add v2.8h, v2.8h, v6.8h + addp v16.8h, v16.8h, v17.8h + addp v18.8h, v18.8h, v19.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + add v16.8h, v16.8h, v20.8h + add v18.8h, v18.8h, v22.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + shl v2.8h, v16.8h, #1 + shl v3.8h, v18.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q5, [x10, #32] + ld1 {v3.8h, v4.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, 
v0.8h, v1.8h + addp v5.8h, v5.8h, v5.8h + addp v3.8h, v3.8h, v4.8h + ldr q18, [x1, #32] + add v2.4h, v2.4h, v5.4h + ld1 {v16.8h, v17.8h}, [x1], x2 + add v0.8h, v0.8h, v3.8h + ldr q21, [x10, #32] + ld1 {v19.8h, v20.8h}, [x10], x2 + addp v18.8h, v18.8h, v18.8h + addp v16.8h, v16.8h, v17.8h + addp v21.8h, v21.8h, v21.8h + addp v19.8h, v19.8h, v20.8h + add v18.4h, v18.4h, v21.4h + add v16.8h, v16.8h, v19.8h + shl v1.4h, v2.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v18.4h, #1 + shl v2.8h, v16.8h, #1 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + ld1 {v4.8h}, [x1], x2 + ld1 {v6.8h}, 
[x10], x2 + addp v0.8h, v0.8h, v4.8h + addp v2.8h, v2.8h, v6.8h + add v0.8h, v0.8h, v2.8h + shl v0.8h, v0.8h, #1 + dup v1.8h, v0.h[3] + dup v3.8h, v0.h[7] + trn2 v2.2d, v0.2d, v3.2d + trn1 v0.2d, v0.2d, v1.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl w6, w6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// 
const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + 
+L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q6, [x10, #32] + ld1 {v4.8h, v5.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, v0.8h, v1.8h + addp v6.8h, 
v6.8h, v6.8h + addp v4.8h, v4.8h, v5.8h + shl v1.4h, v2.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v6.4h, #2 + shl v2.8h, v4.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + addp v0.8h, v0.8h, v0.8h + addp v2.8h, v2.8h, v2.8h + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b 
L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc + +// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + ld1 {v0.4h}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.4h}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + ld1 {v3.8h}, [x10], x2 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #4 + st1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v2.8h, v2.8h, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + lsr x2, x2, #1 // Restore the stride to one line increments + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, 
v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 + shl v2.8h, v2.8h, #3 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + dup v3.8h, v2.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + ld1 {v0.8h, v1.8h}, [x1], x2 + shl v1.8h, v1.8h, #3 + shl v0.8h, v0.8h, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + ld1 {v0.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + 
b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Multiply the height by eight and reuse the w4 calc_subtract_dc path + lsl w6, w6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S new file mode 100644 index 0000000000..245af0e786 --- /dev/null +++ b/third_party/dav1d/src/arm/64/itx.S @@ -0,0 +1,3288 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. 
+ +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + .short 0, 0, 0, 0 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + .short 0, 0, 0, 0 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + .short 0, 0, 0, 0 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 + .short 0, 0, 0, 0 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro smull_smlal d0, d1, s0, s1, c0, c1, sz + smull \d0\().4s, \s0\().4h, \c0 + smlal \d0\().4s, \s1\().4h, \c1 +.ifc \sz, .8h + smull2 \d1\().4s, \s0\().8h, \c0 + smlal2 \d1\().4s, \s1\().8h, \c1 +.endif +.endm + +.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz + smull \d0\().4s, \s0\().4h, \c0 + smlsl \d0\().4s, \s1\().4h, \c1 +.ifc \sz, .8h + smull2 \d1\().4s, \s0\().8h, \c0 + smlsl2 \d1\().4s, \s1\().8h, \c1 +.endif +.endm + +.macro rshrn_sz d0, s0, s1, shift, sz + rshrn \d0\().4h, \s0\().4s, \shift +.ifc \sz, .8h + rshrn2 \d0\().8h, \s1\().4s, \shift +.endif +.endm + +.macro scale_input sz, c, r0, r1, r2 
r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + uaddw \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + sqxtun \narrowdst, \narrowsrc +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src + load_add_store v3.8b, v17.8h, , , , , , \dst, \src + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src + load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src + load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src + load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src + load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src + load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src + load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src + load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src + load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src + load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src + load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src + load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src + load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src + load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src + 
load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src + load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src + load_add_store , , , , , , v5.8b, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits + load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits + load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits + load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits + load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits + load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits + load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits + load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src + load_add_store v3.8b, v17.8h, , , , , , \dst, \src + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src + load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src + load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src + load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src + load_add_store , , , , , , v5.8b, \dst, \src +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr 
\shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + uaddw \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \narrowsrc + sqxtun \narrowdst, \narrowsrc +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src + load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src + load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src + load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src + load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src + load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src + load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src + load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src + load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src + load_add_store4 , , , , , , , , v7.s, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src + load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src + load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src + load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src + load_add_store4 , , , , , , , , v3.s, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz 
w3, 1f + mov w16, #2896*8 + ld1r {v16.8h}, [x2] + dup v0.4h, w16 + sqrdmulh v16.8h, v16.8h, v0.h[0] + strh wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v16.8h, v16.8h, v0.h[0] +.endif +.if \shift > 0 + srshr v16.8h, v16.8h, #\shift +.endif + sqrdmulh v16.8h, v16.8h, v0.h[0] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[0], [x0], x1 + ld1 {v1.s}[1], [x0], x1 + subs w4, w4, #4 + sub x0, x0, x1, lsl #2 + uaddw v0.8h, v16.8h, v0.8b + sqxtun v0.8b, v0.8h + uaddw v1.8h, v16.8h, v1.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v1.8b, v1.8h + st1 {v0.s}[1], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon +1: + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v3.8b}, [x0], x1 + sub x0, x0, x1, lsl #2 + subs w4, w4, #4 + uaddw v21.8h, v16.8h, v1.8b + sqxtun v0.8b, v20.8h + uaddw v22.8h, v16.8h, v2.8b + sqxtun v1.8b, v21.8h + uaddw v23.8h, v16.8h, v3.8b + st1 {v0.8b}, [x0], x1 + sqxtun v2.8b, v22.8h + st1 {v1.8b}, [x0], x1 + sqxtun v3.8b, v23.8h + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon +1: + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + subs w4, w4, #4 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + ld1 {v3.16b}, [x0], x1 + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + sub x0, x0, x1, lsl #2 + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + st1 {v0.16b}, [x0], x1 + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 
1b + ret +endfunc + +function idct_dc_w32_neon +1: + ld1 {v0.16b, v1.16b}, [x0], x1 + subs w4, w4, #2 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + ld1 {v2.16b, v3.16b}, [x0] + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + sub x0, x0, x1 + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + st1 {v0.16b, v1.16b}, [x0], x1 + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v2.16b, v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon +1: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + subs w4, w4, #1 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4h, v16.4h, v17.4h + sub v21.4h, v18.4h, v19.4h + sub v20.4h, v16.4h, v21.4h + sshr v20.4h, v20.4h, #1 + sub v18.4h, v20.4h, v17.4h + sub v17.4h, v20.4h, v19.4h + add v19.4h, v21.4h, v18.4h + sub v16.4h, v16.4h, v17.4h +.endm + +.macro idct_4 r0, r1, r2, r3, sz + smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz + smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz + smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz + rshrn_sz v6, v6, v7, #12, \sz + rshrn_sz v7, v4, v5, #12, \sz + smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz + rshrn_sz v2, v2, v3, #12, \sz + rshrn_sz v3, v4, v5, #12, \sz + sqadd \r0\sz, v2\sz, v6\sz + sqsub \r3\sz, v2\sz, v6\sz + sqadd \r1\sz, 
v3\sz, v7\sz + sqsub \r2\sz, v3\sz, v7\sz +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.4h}, [x16] + idct_4 v16, v17, v18, v19, .4h + ret +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.4h}, [x16] + idct_4 v16, v17, v18, v19, .8h + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.8h}, [x16] + + ssubl v3.4s, v16.4h, v18.4h + smull v4.4s, v16.4h, v0.h[0] + smlal v4.4s, v18.4h, v0.h[1] + smlal v4.4s, v19.4h, v0.h[2] + smull v7.4s, v17.4h, v0.h[3] + saddw v3.4s, v3.4s, v19.4h + smull v5.4s, v16.4h, v0.h[2] + smlsl v5.4s, v18.4h, v0.h[0] + smlsl v5.4s, v19.4h, v0.h[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[2] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + rshrn \o0\().4h, \o0\().4s, #12 + rshrn \o2\().4h, \o2\().4s, #12 + rshrn \o1\().4h, \o1\().4s, #12 + rshrn \o3\().4h, \o3\().4s, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +.macro iadst_8x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.8h}, [x16] + + ssubl v2.4s, v16.4h, v18.4h + ssubl2 v3.4s, v16.8h, v18.8h + smull v4.4s, v16.4h, v0.h[0] + smlal v4.4s, v18.4h, v0.h[1] + smlal v4.4s, v19.4h, v0.h[2] + smull2 v5.4s, v16.8h, v0.h[0] + smlal2 v5.4s, v18.8h, v0.h[1] + smlal2 v5.4s, v19.8h, v0.h[2] + saddw v2.4s, v2.4s, v19.4h + saddw2 v3.4s, v3.4s, v19.8h + smull v6.4s, v16.4h, v0.h[2] + smlsl v6.4s, v18.4h, v0.h[0] + smlsl v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v16.8h, v0.h[2] + smlsl2 v7.4s, v18.8h, v0.h[0] + smlsl2 v7.4s, v19.8h, v0.h[1] + + mul v18.4s, v2.4s, v0.s[2] + mul v19.4s, v3.4s, v0.s[2] + + smull v2.4s, v17.4h, v0.h[3] + smull2 v3.4s, v17.8h, v0.h[3] + + add v16.4s, v4.4s, v2.4s // out0 + add v17.4s, v5.4s, v3.4s + + add v4.4s, v4.4s, v6.4s // out3 + add v5.4s, v5.4s, 
v7.4s + + add v6.4s, v6.4s, v2.4s // out1 + add v7.4s, v7.4s, v3.4s + + sub v4.4s, v4.4s, v2.4s // out3 + sub v5.4s, v5.4s, v3.4s + + rshrn v18.4h, v18.4s, #12 + rshrn2 v18.8h, v19.4s, #12 + + rshrn \o0\().4h, v16.4s, #12 + rshrn2 \o0\().8h, v17.4s, #12 + +.ifc \o2, v17 + mov v17.16b, v18.16b +.endif + + rshrn \o1\().4h, v6.4s, #12 + rshrn2 \o1\().8h, v7.4s, #12 + + rshrn \o3\().4h, v4.4s, #12 + rshrn2 \o3\().8h, v5.4s, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 + iadst_8x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4h_x4_neon, export=1 + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + sqrdmulh v4.4h, v16.4h, v0.h[0] + sqrdmulh v5.4h, v17.4h, v0.h[0] + sqrdmulh v6.4h, v18.4h, v0.h[0] + sqrdmulh v7.4h, v19.4h, v0.h[0] + sqadd v16.4h, v16.4h, v4.4h + sqadd v17.4h, v17.4h, v5.4h + sqadd v18.4h, v18.4h, v6.4h + sqadd v19.4h, v19.4h, v7.4h + ret +endfunc + +function inv_identity_8h_x4_neon, export=1 + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + sqrdmulh v4.8h, v16.8h, v0.h[0] + sqrdmulh v5.8h, v17.8h, v0.h[0] + sqrdmulh v6.8h, v18.8h, v0.h[0] + sqrdmulh v7.8h, v19.8h, v0.h[0] + sqadd v16.8h, v16.8h, v4.8h + sqadd v17.8h, v17.8h, v5.8h + sqadd v18.8h, v18.8h, v6.8h + sqadd v19.8h, v19.8h, v7.8h + ret +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h + sqrdmulh v2.8h, \i, \c + srhadd \i, \i, v2.8h +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + mov x15, x30 + movi v31.8h, #0 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v31.8h}, [x2], #16 + + sshr v16.4h, v16.4h, #2 + sshr v17.4h, v17.4h, #2 + sshr v18.4h, v18.4h, #2 + sshr v19.4h, v19.4h, #2 + + iwht4 + + st1 {v31.8h}, [x2], #16 + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.s}[0], [x0], x1 + 
ld1 {v1.s}[1], [x0], x1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v31.8h, #0 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v31.8h}, [x2], #16 + + blr x4 + + st1 {v31.8h}, [x2], #16 + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.s}[0], [x0], x1 + ld1 {v1.s}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + sub x0, x0, x1, lsl #2 + uaddw v16.8h, v16.8h, v0.8b + sqxtun v0.8b, v16.8h + uaddw v18.8h, v18.8h, v1.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v1.8b, v18.8h + st1 {v0.s}[1], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x0], x1 + + br x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + mov w16, #2896*8 + ld1r {v16.8h}, [x2] + dup v4.8h, w16 + strh wzr, [x2] + sqrdmulh v16.8h, v16.8h, v4.h[0] + ld1 {v0.s}[0], [x0], x1 + sqrdmulh v20.8h, v16.8h, v4.h[0] + ld1 {v0.s}[1], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.s}[0], [x0], x1 + srshr v18.8h, v20.8h, #4 + ld1 {v1.s}[1], [x0], x1 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4h_x4_neon + adr x5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb + idct_4 \r0, \r2, \r4, \r6, \sz + + smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a + smull_smlal v4, v5, \r1, \r7, v0.h[5], 
v0.h[4], \sz // -> t7a + smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a + rshrn_sz \r1, v2, v3, #12, \sz // t4a + rshrn_sz \r7, v4, v5, #12, \sz // t7a + smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a + rshrn_sz \r3, v6, v7, #12, \sz // t5a + rshrn_sz \r5, v2, v3, #12, \sz // t6a + + sqadd v2\sz, \r1\sz, \r3\sz // t4 + sqsub \r1\sz, \r1\sz, \r3\sz // t5a + sqadd v3\sz, \r7\sz, \r5\sz // t7 + sqsub \r3\sz, \r7\sz, \r5\sz // t6a + + smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 + smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 + rshrn_sz v4, v4, v5, #12, \sz // t5 + rshrn_sz v5, v6, v7, #12, \sz // t6 + + sqsub \r7\sz, \r0\sz, v3\sz // out7 + sqadd \r0\sz, \r0\sz, v3\sz // out0 + sqadd \r1\sz, \r2\sz, v5\sz // out1 + sqsub v6\sz, \r2\sz, v5\sz // out6 + sqadd \r2\sz, \r4\sz, v4\sz // out2 + sqsub \r5\sz, \r4\sz, v4\sz // out5 + sqadd \r3\sz, \r6\sz, v2\sz // out3 + sqsub \r4\sz, \r6\sz, v2\sz // out4 + mov \r6\szb, v6\szb // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b + ret +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz + movrel x16, iadst8_coeffs + ld1 {v0.8h, v1.8h}, [x16] + + smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz + smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz + smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz + rshrn_sz v16, v2, v3, #12, \sz // t0a + rshrn_sz v23, v4, v5, #12, \sz // t1a + smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz + smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz + rshrn_sz v18, v6, v7, #12, \sz // t2a + rshrn_sz v21, v2, v3, #12, \sz // t3a + smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz + smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz + rshrn_sz v20, v4, 
v5, #12, \sz // t4a + rshrn_sz v19, v6, v7, #12, \sz // t5a + smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz + rshrn_sz v22, v2, v3, #12, \sz // t6a + rshrn_sz v17, v4, v5, #12, \sz // t7a + + sqadd v2\sz, v16\sz, v20\sz // t0 + sqsub v3\sz, v16\sz, v20\sz // t4 + sqadd v4\sz, v23\sz, v19\sz // t1 + sqsub v5\sz, v23\sz, v19\sz // t5 + sqadd v6\sz, v18\sz, v22\sz // t2 + sqsub v7\sz, v18\sz, v22\sz // t6 + sqadd v18\sz, v21\sz, v17\sz // t3 + sqsub v19\sz, v21\sz, v17\sz // t7 + + smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz + smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz + smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz + + rshrn_sz v3, v16, v17, #12, \sz // t4a + rshrn_sz v5, v20, v21, #12, \sz // t5a + + smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz + + rshrn_sz v7, v22, v23, #12, \sz // t6a + rshrn_sz v19, v16, v17, #12, \sz // t7a + + sqadd \o0\()\sz, v2\sz, v6\sz // out0 + sqsub v2\sz, v2\sz, v6\sz // t2 + sqadd \o7\()\sz, v4\sz, v18\sz // out7 + sqsub v4\sz, v4\sz, v18\sz // t3 + sqneg \o7\()\sz, \o7\()\sz // out7 + + sqadd \o1\()\sz, v3\sz, v7\sz // out1 + sqsub v3\sz, v3\sz, v7\sz // t6 + sqadd \o6\()\sz, v5\sz, v19\sz // out6 + sqsub v5\sz, v5\sz, v19\sz // t7 + sqneg \o1\()\sz, \o1\()\sz // out1 + + smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) + smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) + smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) + rshrn_sz v2, v18, v19, #12, \sz // out3 + smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) + rshrn_sz v3, v20, v21, #12, \sz // out5 + rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) + rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) + + sqneg \o3\()\sz, v2\sz // out3 + sqneg \o5\()\sz, v3\sz // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h + ret +endfunc + +function inv_flipadst_8h_x8_neon, export=1 + 
iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h + ret +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h + ret +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h + ret +endfunc + +function inv_identity_8h_x8_neon, export=1 + sqshl v16.8h, v16.8h, #1 + sqshl v17.8h, v17.8h, #1 + sqshl v18.8h, v18.8h, #1 + sqshl v19.8h, v19.8h, #1 + sqshl v20.8h, v20.8h, #1 + sqshl v21.8h, v21.8h, #1 + sqshl v22.8h, v22.8h, #1 + sqshl v23.8h, v23.8h, #1 + ret +endfunc + +function inv_identity_4h_x8_neon, export=1 + sqshl v16.4h, v16.4h, #1 + sqshl v17.4h, v17.4h, #1 + sqshl v18.4h, v18.4h, #1 + sqshl v19.4h, v19.4h, #1 + sqshl v20.4h, v20.4h, #1 + sqshl v21.4h, v21.4h, #1 + sqshl v22.4h, v22.4h, #1 + sqshl v23.4h, v23.4h, #1 + ret +endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + movi v28.8h, #0 + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] + +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blr x4 + + srshr v16.8h, v16.8h, #1 + srshr v17.8h, v17.8h, #1 + srshr v18.8h, v18.8h, #1 + srshr v19.8h, v19.8h, #1 + srshr v20.8h, v20.8h, #1 + srshr v21.8h, v21.8h, #1 + srshr v22.8h, v22.8h, #1 + srshr v23.8h, v23.8h, #1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + blr x5 + + load_add_store_8x8 x0, x7 + br x15 +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + adr x5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + adr x4, inv_\txfm1\()_8h_x8_neon + b 
inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + movi v30.8h, #0 + movi v31.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v30.8h,v31.8h}, [x2], #32 + ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2] + st1 {v30.8h,v31.8h}, [x2] + + scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + br x15 +endfunc + +function inv_txfm_add_4x8_neon + movi v28.8h, #0 + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] + + scale_input .8h, v0.h[0], v16, v17, v18, v19 + + blr x4 + + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + ins v20.d[0], v16.d[1] + ins v21.d[0], v17.d[1] + ins v22.d[0], v18.d[1] + ins v23.d[0], v19.d[1] + + blr x5 + + load_add_store_4x8 x0, x7 + br x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon + adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 
\w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +.macro idct_16 sz, szb /* 16-point inverse DCT over v16-v31. sz is the halfword arrangement (.4h or .8h) and szb the matching byte arrangement used for whole-register moves. The even-numbered registers go through idct_8; the odd-numbered ones are rotated with the v1 lanes and then the v0 lanes, using v2-v7 as temporaries. Per the out0..out15 notes at the end, the outputs land in v16..v31 in ascending order. The caller must have loaded v0 and v1 (see inv_dct_8h_x16_neon). */ + idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb + + smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a + smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a + smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a + rshrn_sz v17, v2, v3, #12, \sz // t8a + rshrn_sz v31, v4, v5, #12, \sz // t15a + smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a + smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a + rshrn_sz v23, v6, v7, #12, \sz // t9a + rshrn_sz v25, v2, v3, #12, \sz // t14a + smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a + smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a + rshrn_sz v21, v4, v5, #12, \sz // t10a + rshrn_sz v27, v6, v7, #12, \sz // t13a + smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a + rshrn_sz v19, v2, v3, #12, \sz // t11a + rshrn_sz v29, v4, v5, #12, \sz // t12a + + sqsub v2\sz, v17\sz, v23\sz // t9 + sqadd v17\sz, v17\sz, v23\sz // t8 + sqsub v3\sz, v31\sz, v25\sz // t14 + sqadd v31\sz, v31\sz, v25\sz // t15 + sqsub v23\sz, v19\sz, v21\sz // t10 + sqadd v19\sz, v19\sz, v21\sz // t11 + sqadd v25\sz, v29\sz, v27\sz // t12 + sqsub v29\sz, v29\sz, v27\sz // t13 + + smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a + smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a + rshrn_sz v21, v4, v5, #12, \sz // t9a + rshrn_sz v27, v6, v7, #12, \sz // t14a + + smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], 
\sz // -> t13a + smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a + rshrn_sz v29, v4, v5, #12, \sz // t13a + neg v6.4s, v6.4s +.ifc \sz, .8h + neg v7.4s, v7.4s +.endif + rshrn_sz v23, v6, v7, #12, \sz // t10a + + sqsub v2\sz, v17\sz, v19\sz // t11a + sqadd v17\sz, v17\sz, v19\sz // t8a + sqsub v3\sz, v31\sz, v25\sz // t12a + sqadd v31\sz, v31\sz, v25\sz // t15a + sqadd v19\sz, v21\sz, v23\sz // t9 + sqsub v21\sz, v21\sz, v23\sz // t10 + sqsub v25\sz, v27\sz, v29\sz // t13 + sqadd v27\sz, v27\sz, v29\sz // t14 + + smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11 + smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 + smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a + + rshrn_sz v4, v4, v5, #12, \sz // t11 + rshrn_sz v5, v6, v7, #12, \sz // t12 + smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a + rshrn_sz v2, v2, v3, #12, \sz // t10a + rshrn_sz v3, v6, v7, #12, \sz // t13a + + sqadd v6\sz, v16\sz, v31\sz // out0 + sqsub v31\sz, v16\sz, v31\sz // out15 + mov v16\szb, v6\szb + sqadd v23\sz, v30\sz, v17\sz // out7 + sqsub v7\sz, v30\sz, v17\sz // out8 + sqadd v17\sz, v18\sz, v27\sz // out1 + sqsub v30\sz, v18\sz, v27\sz // out14 + sqadd v18\sz, v20\sz, v3\sz // out2 + sqsub v29\sz, v20\sz, v3\sz // out13 + sqadd v3\sz, v28\sz, v19\sz // out6 + sqsub v25\sz, v28\sz, v19\sz // out9 + sqadd v19\sz, v22\sz, v5\sz // out3 + sqsub v28\sz, v22\sz, v5\sz // out12 + sqadd v20\sz, v24\sz, v4\sz // out4 + sqsub v27\sz, v24\sz, v4\sz // out11 + sqadd v21\sz, v26\sz, v2\sz // out5 + sqsub v26\sz, v26\sz, v2\sz // out10 + mov v24\szb, v7\szb + mov v22\szb, v3\szb +.endm + +function inv_dct_8h_x16_neon, export=1 /* Loads v0 and v1 from idct_coeffs (address in x16) and expands idct_16 on full .8h vectors. */ + movrel x16, idct_coeffs + ld1 {v0.8h, v1.8h}, [x16] + idct_16 .8h, .16b + ret +endfunc + +function inv_dct_4h_x16_neon, export=1 /* Same as inv_dct_8h_x16_neon, but idct_16 is expanded for the low four halfword lanes only (.4h / .8b). */ + movrel x16, idct_coeffs + ld1 {v0.8h, v1.8h}, [x16] + idct_16 .4h, .8b + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb + 
movrel x16, iadst16_coeffs + ld1 {v0.8h, v1.8h}, [x16] + movrel x16, idct_coeffs + + smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 + smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 + smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 + rshrn_sz v16, v2, v3, #12, \sz // t0 + rshrn_sz v31, v4, v5, #12, \sz // t1 + smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 + smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 + rshrn_sz v18, v6, v7, #12, \sz // t2 + rshrn_sz v29, v2, v3, #12, \sz // t3 + smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 + smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 + rshrn_sz v20, v4, v5, #12, \sz // t4 + rshrn_sz v27, v6, v7, #12, \sz // t5 + smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 + smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 + rshrn_sz v22, v2, v3, #12, \sz // t6 + rshrn_sz v25, v4, v5, #12, \sz // t7 + smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 + smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 + rshrn_sz v23, v6, v7, #12, \sz // t8 + rshrn_sz v24, v2, v3, #12, \sz // t9 + smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 + smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 + rshrn_sz v21, v4, v5, #12, \sz // t10 + rshrn_sz v26, v6, v7, #12, \sz // t11 + smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 + smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 + rshrn_sz v19, v2, v3, #12, \sz // t12 + rshrn_sz v28, v4, v5, #12, \sz // t13 + smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 + rshrn_sz v17, v6, v7, #12, \sz // t14 + rshrn_sz v30, v2, v3, #12, \sz // t15 + + ld1 {v0.8h}, [x16] + + sqsub v2\sz, v16\sz, v23\sz // t8a + sqadd v16\sz, v16\sz, v23\sz // t0a + sqsub v3\sz, v31\sz, v24\sz // t9a + sqadd v31\sz, v31\sz, v24\sz // t1a + sqadd v23\sz, v18\sz, v21\sz // t2a + sqsub v18\sz, v18\sz, v21\sz // t10a + sqadd v24\sz, 
v29\sz, v26\sz // t3a + sqsub v29\sz, v29\sz, v26\sz // t11a + sqadd v21\sz, v20\sz, v19\sz // t4a + sqsub v20\sz, v20\sz, v19\sz // t12a + sqadd v26\sz, v27\sz, v28\sz // t5a + sqsub v27\sz, v27\sz, v28\sz // t13a + sqadd v19\sz, v22\sz, v17\sz // t6a + sqsub v22\sz, v22\sz, v17\sz // t14a + sqadd v28\sz, v25\sz, v30\sz // t7a + sqsub v25\sz, v25\sz, v30\sz // t15a + + smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 + smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 + smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 + rshrn_sz v17, v4, v5, #12, \sz // t8 + rshrn_sz v30, v6, v7, #12, \sz // t9 + smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 + smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 + rshrn_sz v18, v2, v3, #12, \sz // t10 + rshrn_sz v29, v4, v5, #12, \sz // t11 + smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 + smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 + rshrn_sz v27, v6, v7, #12, \sz // t12 + rshrn_sz v20, v2, v3, #12, \sz // t13 + smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 + rshrn_sz v25, v4, v5, #12, \sz // t14 + rshrn_sz v22, v6, v7, #12, \sz // t15 + + sqsub v2\sz, v16\sz, v21\sz // t4 + sqadd v16\sz, v16\sz, v21\sz // t0 + sqsub v3\sz, v31\sz, v26\sz // t5 + sqadd v31\sz, v31\sz, v26\sz // t1 + sqadd v21\sz, v23\sz, v19\sz // t2 + sqsub v23\sz, v23\sz, v19\sz // t6 + sqadd v26\sz, v24\sz, v28\sz // t3 + sqsub v24\sz, v24\sz, v28\sz // t7 + sqadd v19\sz, v17\sz, v27\sz // t8a + sqsub v17\sz, v17\sz, v27\sz // t12a + sqadd v28\sz, v30\sz, v20\sz // t9a + sqsub v30\sz, v30\sz, v20\sz // t13a + sqadd v27\sz, v18\sz, v25\sz // t10a + sqsub v18\sz, v18\sz, v25\sz // t14a + sqadd v20\sz, v29\sz, v22\sz // t11a + sqsub v29\sz, v29\sz, v22\sz // t15a + + smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a + smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a + smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a + 
rshrn_sz v22, v4, v5, #12, \sz // t4a + rshrn_sz v25, v6, v7, #12, \sz // t5a + smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a + smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 + rshrn_sz v24, v2, v3, #12, \sz // t6a + rshrn_sz v23, v4, v5, #12, \sz // t7a + smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 + smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 + rshrn_sz v17, v6, v7, #12, \sz // t12 + smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 + rshrn_sz v29, v2, v3, #12, \sz // t13 + rshrn_sz v30, v4, v5, #12, \sz // t14 + rshrn_sz v18, v6, v7, #12, \sz // t15 + + sqsub v2\sz, v16\sz, v21\sz // t2a +.ifc \o0, v16 + sqadd \o0\sz, v16\sz, v21\sz // out0 + sqsub v21\sz, v31\sz, v26\sz // t3a + sqadd \o15\sz, v31\sz, v26\sz // out15 +.else + sqadd v4\sz, v16\sz, v21\sz // out0 + sqsub v21\sz, v31\sz, v26\sz // t3a + sqadd \o15\sz, v31\sz, v26\sz // out15 + mov \o0\szb, v4\szb +.endif + sqneg \o15\sz, \o15\sz // out15 + + sqsub v3\sz, v29\sz, v18\sz // t15a + sqadd \o13\sz, v29\sz, v18\sz // out13 + sqadd \o2\sz, v17\sz, v30\sz // out2 + sqsub v26\sz, v17\sz, v30\sz // t14a + sqneg \o13\sz, \o13\sz // out13 + + sqadd \o1\sz, v19\sz, v27\sz // out1 + sqsub v27\sz, v19\sz, v27\sz // t10 + sqadd \o14\sz, v28\sz, v20\sz // out14 + sqsub v20\sz, v28\sz, v20\sz // t11 + sqneg \o1\sz, \o1\sz // out1 + + sqadd \o3\sz, v22\sz, v24\sz // out3 + sqsub v22\sz, v22\sz, v24\sz // t6 + sqadd \o12\sz, v25\sz, v23\sz // out12 + sqsub v23\sz, v25\sz, v23\sz // t7 + sqneg \o3\sz, \o3\sz // out3 + + smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23) + smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) + smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26) + + rshrn_sz v24, v24, v25, #12, \sz // out8 + rshrn_sz v4, v4, v5, #12, \sz // out7 + rshrn_sz v5, v6, v7, #12, \sz // out5 + smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or 
v21) + smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) + rshrn_sz v26, v6, v7, #12, \sz // out10 + + smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) + smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) + smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22) + + rshrn_sz \o4, v2, v3, #12, \sz // out4 + rshrn_sz v6, v6, v7, #12, \sz // out11 + rshrn_sz v7, v21, v25, #12, \sz // out9 + rshrn_sz \o6, v22, v23, #12, \sz // out6 + +.ifc \o8, v23 + mov \o8\szb, v24\szb + mov \o10\szb, v26\szb +.endif + + sqneg \o7\sz, v4\sz // out7 + sqneg \o5\sz, v5\sz // out5 + sqneg \o11\sz, v6\sz // out11 + sqneg \o9\sz, v7\sz // out9 +.endm + +function inv_adst_8h_x16_neon, export=1 + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b + ret +endfunc + +function inv_flipadst_8h_x16_neon, export=1 + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b + ret +endfunc + +function inv_adst_4h_x16_neon, export=1 + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b + ret +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b + ret +endfunc + +function inv_identity_8h_x16_neon, export=1 + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.8h, v\i\().8h, v0.h[0] + sqadd v\i\().8h, v\i\().8h, v\i\().8h + sqadd v\i\().8h, v\i\().8h, v2.8h +.endr + ret +endfunc + +function inv_identity_4h_x16_neon, export=1 + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4h, v\i\().4h, v0.h[0] + sqadd v\i\().4h, v\i\().4h, v\i\().4h + sqadd v\i\().4h, v\i\().4h, v2.4h +.endr + ret +endfunc + +.macro 
identity_8x16_shift2 c /* For each of v16-v31 (.8h): v2 = sqrdmulh(reg, c), v2 is shifted right by 1 (sshr, no rounding), then reg = srhadd(reg, v2). Clobbers v2. */ +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + sqrdmulh v2.8h, \i, \c + sshr v2.8h, v2.8h, #1 + srhadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x16_shift1 c /* For each of v16-v31 (.8h): v2 = sqrdmulh(reg, c), rounded right shift of v2 by 1 (srshr), then reg = sqadd(reg, v2). Clobbers v2. */ +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + sqrdmulh v2.8h, \i, \c + srshr v2.8h, v2.8h, #1 + sqadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x8_shift1 c /* Same per-register sequence as identity_8x16_shift1, restricted to v16-v23. */ +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + sqrdmulh v2.8h, \i, \c + srshr v2.8h, v2.8h, #1 + sqadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x8 c /* For each of v16-v23 (.8h): v2 = sqrdmulh(reg, c), reg is doubled with saturation, then v2 is added with saturation. Clobbers v2. */ +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + sqrdmulh v2.8h, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v2.8h +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix /* Generates inv_txfm_horz<suffix>_16x8_neon. It loads sixteen .8h vectors (v16-v31) from [x7] with stride x8, writing zeros (v7) back over each one. With scale=1 they are multiplied by v0.h[0] through scale_input. With identity=1 it expands identity_8x16_shift2, otherwise it calls the routine in x4. A rounding right shift by shift is applied when shift > 0. Both 8x8 halves are then transposed and stored interleaved (v16, v24, v17, v25, ...) to [x6], 16 bytes at a time. Leaves through x14, a copy of x30 taken on entry. */ +function inv_txfm_horz\suffix\()_16x8_neon + mov x14, x30 + movi v7.8h, #0 +.if \identity + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.elseif \scale + mov w16, #2896*8 + dup v0.4h, w16 +.endif +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr +.if \scale + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif +.if \identity + identity_8x16_shift2 v0.h[0] +.else + blr x4 +.endif +.if \shift > 0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + srshr \i, \i, #\shift +.endr +.endif + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 + +.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h + st1 {\i}, 
[x6], #16 +.endr + + br x14 +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=0, suffix=_identity + +function inv_txfm_add_vert_8x16_neon /* Loads v16-v31 (.8h) from [x7] with stride x8, calls the routine in x5, then expands load_add_store_8x16 x6, x7. Leaves through x14, a copy of x30 taken on entry. */ + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + br x14 +endfunc + +function inv_txfm_add_16x16_neon /* Two-pass 16x16 driver using a 512-byte stack buffer. First pass: calls the routine in x9 twice, with x7 = x2 and x2 plus 16 bytes, x8 = 32 and x6 = sp and sp plus 256. The second call is skipped when w3 < w13; the 256 bytes at sp plus 256 are zero-filled instead. Second pass: calls inv_txfm_add_vert_8x16_neon twice, with x6 = x0 and x0 plus 8, x7 = sp and sp plus 16, x8 = 32. Releases the buffer and leaves through x15, a copy of x30 taken on entry. */ + mov x15, x30 + sub sp, sp, #512 +.irp i, 0, 8 + add x6, sp, #(\i*16*2) +.if \i == 8 + cmp w3, w13 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #16*2 + blr x9 +.endr + b 2f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr +2: +.irp i, 0, 8 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + br x15 +endfunc + +.macro def_fn_16x16 txfm1, txfm2, eob_half /* Exported 16x16 wrapper. It expands idct_dc 16, 16, 2 for the dct_dct pair only. x9 is the first-pass routine: the identity one when txfm1 is identity, otherwise the generic one with the 16-point transform in x4. x5 is the second-pass transform and x13 holds eob_half, the value inv_txfm_add_16x16_neon compares w3 against. It then branches to that driver. */ +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif +.ifc \txfm1, identity + adr x9, inv_txfm_horz_identity_16x8_neon +.else + adr x9, inv_txfm_horz_16x8_neon + adr x4, inv_\txfm1\()_8h_x16_neon +.endif + adr x5, inv_\txfm2\()_8h_x16_neon + mov x13, #\eob_half + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct, 36 +def_fn_16x16 identity, identity, 36 +def_fn_16x16 dct, adst, 36 +def_fn_16x16 dct, flipadst, 36 +def_fn_16x16 dct, identity, 8 +def_fn_16x16 adst, dct, 36 +def_fn_16x16 adst, adst, 36 +def_fn_16x16 adst, flipadst, 36 +def_fn_16x16 flipadst, dct, 36 +def_fn_16x16 flipadst, adst, 36 +def_fn_16x16 flipadst, flipadst, 36 +def_fn_16x16 identity, dct, 8 + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + mov x15, x30 + movi v4.8h, #0 + +.ifc \variant, identity_ +.irp i, v16.4h, v17.4h, v18.4h, v19.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr +.irp i, v16.d, v17.d, v18.d, v19.d + 
ld1 {\i}[1], [x2] + st1 {v4.4h}, [x2], #8 +.endr + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr +.irp i, v20.d, v21.d, v22.d, v23.d + ld1 {\i}[1], [x2] + st1 {v4.4h}, [x2], #8 +.endr + + identity_8x16_shift1 v0.h[0] +.else +.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr + + blr x4 + + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + srshr \i, \i, #1 +.endr +.endif + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + +.ifc \variant, identity_ + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b +.else + ins v24.d[1], v28.d[0] + ins v25.d[1], v29.d[0] + ins v26.d[1], v30.d[0] + ins v27.d[1], v31.d[0] + srshr v16.8h, v24.8h, #1 + srshr v17.8h, v25.8h, #1 + srshr v18.8h, v26.8h, #1 + srshr v19.8h, v27.8h, #1 +.endif + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #8 + load_add_store_8x4 x6, x7 + + br x15 +endfunc + +function inv_txfm_\variant\()add_4x16_neon + mov x15, x30 + movi v2.8h, #0 + + mov x11, #32 + cmp w3, w13 + b.lt 1f + + add x6, x2, #16 +.ifc \variant, identity_ +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + ld1 {\i}, [x6] + st1 {v2.8h}, [x6], x11 +.endr + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + identity_8x4_shift1 v24, v25, v26, v27, v0.h[0] +.else +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + ld1 {\i}, [x6] + st1 {v2.8h}, [x6], x11 +.endr + blr x4 + srshr v24.8h, v16.8h, #1 + srshr v25.8h, v17.8h, #1 + srshr v26.8h, v18.8h, #1 + srshr v27.8h, v19.8h, #1 +.endif + transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7 + ins v28.d[0], v24.d[1] + ins v29.d[0], v25.d[1] + ins v30.d[0], v26.d[1] + ins v31.d[0], v27.d[1] + + b 2f +1: +.irp i, 
v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + movi v2.8h, #0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + ld1 {\i}, [x2] + st1 {v2.8h}, [x2], x11 +.endr +.ifc \variant, identity_ + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] +.else + blr x4 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + srshr \i, \i, #1 +.endr +.endif + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + ins v20.d[0], v16.d[1] + ins v21.d[0], v17.d[1] + ins v22.d[0], v18.d[1] + ins v23.d[0], v19.d[1] + + blr x5 + + load_add_store_4x16 x0, x6 + + br x15 +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_4h_x\h\()_neon + mov w13, #\eob_half +.else + adr x4, inv_\txfm1\()_4h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + mov x15, x30 + movi v4.8h, #0 + mov 
w16, #2896*8 + dup v0.4h, w16 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x2] + st1 {v4.8h}, [x2], #16 +.endr + + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.ifc \variant, identity_ + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 + identity_8x16_shift1 v0.h[0] +.else + blr x4 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + srshr \i, \i, #1 +.endr +.endif + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + +.ifc \variant, identity_ + mov v16.16b, v24.16b + mov v17.16b, v25.16b + mov v18.16b, v26.16b + mov v19.16b, v27.16b + mov v20.16b, v28.16b + mov v21.16b, v29.16b + mov v22.16b, v30.16b + mov v23.16b, v31.16b +.else + srshr v16.8h, v24.8h, #1 + srshr v17.8h, v25.8h, #1 + srshr v18.8h, v26.8h, #1 + srshr v19.8h, v27.8h, #1 + srshr v20.8h, v28.8h, #1 + srshr v21.8h, v29.8h, #1 + srshr v22.8h, v30.8h, #1 + srshr v23.8h, v31.8h, #1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + add x0, x0, #8 + load_add_store_8x8 x0, x7 + + br x15 +endfunc + +function inv_txfm_\variant\()add_8x16_neon + mov x15, x30 + movi v4.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + mov x11, #32 + + cmp w3, w13 + b.lt 1f + + add x6, x2, #16 +.ifc \variant, identity_ +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x6] + st1 {v4.8h}, [x6], x11 +.endr + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 + // The identity shl #1 and downshift srshr #1 cancel out +.else +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x6] + st1 {v4.8h}, [x6], x11 +.endr + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + srshr v24.8h, v16.8h, #1 + srshr v25.8h, v17.8h, #1 + srshr 
v26.8h, v18.8h, #1 + srshr v27.8h, v19.8h, #1 + srshr v28.8h, v20.8h, #1 + srshr v29.8h, v21.8h, #1 + srshr v30.8h, v22.8h, #1 + srshr v31.8h, v23.8h, #1 +.endif + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + movi v4.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v4.8h}, [x2], x11 +.endr + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blr x4 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + srshr \i, \i, #1 +.endr +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + load_add_store_8x16 x0, x6 + + br x15 +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon +.if \w == 8 + mov x13, #\eob_half +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43 +def_fn_816 \w, \h, identity, identity, 43 +def_fn_816 \w, \h, dct, adst, 43 +def_fn_816 \w, \h, dct, flipadst, 43 +def_fn_816 \w, \h, dct, identity, 8 +def_fn_816 \w, \h, adst, dct, 43 +def_fn_816 \w, \h, adst, adst, 43 +def_fn_816 \w, \h, adst, flipadst, 43 +def_fn_816 \w, \h, flipadst, dct, 43 +def_fn_816 \w, \h, flipadst, adst, 43 +def_fn_816 \w, \h, flipadst, flipadst, 43 +def_fn_816 \w, \h, identity, dct, 64 +def_fn_816 \w, \h, adst, identity, 8 +def_fn_816 \w, \h, flipadst, identity, 8 +def_fn_816 \w, \h, identity, 
adst, 64 +def_fn_816 \w, \h, identity, flipadst, 64 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_8h_x16_neon, export=1 + movrel x16, idct_coeffs, 2*16 + ld1 {v0.8h, v1.8h}, [x16] + sub x16, x16, #2*16 + + smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a + smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a + smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a + rshrn_sz v16, v2, v3, #12, .8h // t16a + rshrn_sz v31, v4, v5, #12, .8h // t31a + smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a + smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a + rshrn_sz v24, v6, v7, #12, .8h // t17a + rshrn_sz v23, v2, v3, #12, .8h // t30a + smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a + smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a + rshrn_sz v20, v4, v5, #12, .8h // t18a + rshrn_sz v27, v6, v7, #12, .8h // t29a + smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a + smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a + rshrn_sz v28, v2, v3, #12, .8h // t19a + rshrn_sz v19, v4, v5, #12, .8h // t28a + smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a + smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a + rshrn_sz v18, v6, v7, #12, .8h // t20a + rshrn_sz v29, v2, v3, #12, .8h // t27a + smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a + smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a + rshrn_sz v26, v4, v5, #12, .8h // t21a + rshrn_sz v21, v6, v7, #12, .8h // t26a + smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a + smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a + rshrn_sz v22, v2, v3, #12, .8h // t22a + rshrn_sz v25, v4, v5, #12, .8h // t25a + smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a + rshrn_sz v30, v6, v7, #12, .8h // t23a + rshrn_sz v17, v2, v3, #12, .8h // t24a + + ld1 {v0.8h}, [x16] + + sqsub v2.8h, v16.8h, v24.8h // t17 + sqadd 
v16.8h, v16.8h, v24.8h // t16 + sqsub v3.8h, v31.8h, v23.8h // t30 + sqadd v31.8h, v31.8h, v23.8h // t31 + sqsub v24.8h, v28.8h, v20.8h // t18 + sqadd v28.8h, v28.8h, v20.8h // t19 + sqadd v23.8h, v18.8h, v26.8h // t20 + sqsub v18.8h, v18.8h, v26.8h // t21 + sqsub v20.8h, v30.8h, v22.8h // t22 + sqadd v30.8h, v30.8h, v22.8h // t23 + sqadd v26.8h, v17.8h, v25.8h // t24 + sqsub v17.8h, v17.8h, v25.8h // t25 + sqsub v22.8h, v29.8h, v21.8h // t26 + sqadd v29.8h, v29.8h, v21.8h // t27 + sqadd v25.8h, v19.8h, v27.8h // t28 + sqsub v19.8h, v19.8h, v27.8h // t29 + + smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a + smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a + smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a + rshrn_sz v21, v4, v5, #12, .8h // t17a + rshrn_sz v27, v6, v7, #12, .8h // t30a + neg v2.4s, v2.4s // -> t18a + neg v3.4s, v3.4s // -> t18a + smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a + smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a + rshrn_sz v19, v2, v3, #12, .8h // t18a + rshrn_sz v24, v4, v5, #12, .8h // t29a + smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a + smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a + rshrn_sz v22, v6, v7, #12, .8h // t21a + rshrn_sz v18, v2, v3, #12, .8h // t26a + neg v4.4s, v4.4s // -> t22a + neg v5.4s, v5.4s // -> t22a + smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a + rshrn_sz v17, v4, v5, #12, .8h // t22a + rshrn_sz v20, v6, v7, #12, .8h // t25a + + sqsub v2.8h, v27.8h, v24.8h // t29 + sqadd v27.8h, v27.8h, v24.8h // t30 + sqsub v3.8h, v21.8h, v19.8h // t18 + sqadd v21.8h, v21.8h, v19.8h // t17 + sqsub v24.8h, v16.8h, v28.8h // t19a + sqadd v16.8h, v16.8h, v28.8h // t16a + sqsub v19.8h, v30.8h, v23.8h // t20a + sqadd v30.8h, v30.8h, v23.8h // t23a + sqsub v28.8h, v17.8h, v22.8h // t21 + sqadd v17.8h, v17.8h, v22.8h // t22 + sqadd v23.8h, v26.8h, v29.8h // t24a + sqsub v26.8h, v26.8h, v29.8h // t27a + sqadd 
v22.8h, v20.8h, v18.8h // t25 + sqsub v20.8h, v20.8h, v18.8h // t26 + sqsub v29.8h, v31.8h, v25.8h // t28a + sqadd v31.8h, v31.8h, v25.8h // t31a + + smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a + smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a + smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 + rshrn_sz v18, v4, v5, #12, .8h // t18a + rshrn_sz v25, v6, v7, #12, .8h // t29a + smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 + smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 + rshrn_sz v29, v2, v3, #12, .8h // t19 + rshrn_sz v24, v4, v5, #12, .8h // t28 + neg v6.4s, v6.4s // -> t20 + neg v7.4s, v7.4s // -> t20 + smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 + smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a + rshrn_sz v26, v6, v7, #12, .8h // t20 + rshrn_sz v19, v2, v3, #12, .8h // t27 + neg v4.4s, v4.4s // -> t21a + neg v5.4s, v5.4s // -> t21a + smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a + rshrn_sz v20, v4, v5, #12, .8h // t21a + rshrn_sz v28, v6, v7, #12, .8h // t26a + + sqsub v2.8h, v16.8h, v30.8h // t23 + sqadd v16.8h, v16.8h, v30.8h // t16 = out16 + sqsub v3.8h, v31.8h, v23.8h // t24 + sqadd v31.8h, v31.8h, v23.8h // t31 = out31 + sqsub v23.8h, v21.8h, v17.8h // t22a + sqadd v17.8h, v21.8h, v17.8h // t17a = out17 + sqadd v30.8h, v27.8h, v22.8h // t30a = out30 + sqsub v21.8h, v27.8h, v22.8h // t25a + sqsub v27.8h, v18.8h, v20.8h // t21 + sqadd v18.8h, v18.8h, v20.8h // t18 = out18 + sqadd v4.8h, v29.8h, v26.8h // t19a = out19 + sqsub v26.8h, v29.8h, v26.8h // t20a + sqadd v29.8h, v25.8h, v28.8h // t29 = out29 + sqsub v25.8h, v25.8h, v28.8h // t26 + sqadd v28.8h, v24.8h, v19.8h // t28a = out28 + sqsub v24.8h, v24.8h, v19.8h // t27a + mov v19.16b, v4.16b // out19 + + smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 + smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 + rshrn_sz v20, v4, v5, #12, .8h // t20 + rshrn_sz v22, 
v6, v7, #12, .8h // t27 + + smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a + smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a + mov v27.16b, v22.16b // t27 + rshrn_sz v26, v4, v5, #12, .8h // t26a + + smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 + smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 + rshrn_sz v21, v6, v7, #12, .8h // t21a + rshrn_sz v22, v24, v25, #12, .8h // t22 + rshrn_sz v25, v4, v5, #12, .8h // t25 + + smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a + smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a + rshrn_sz v23, v4, v5, #12, .8h // t23a + rshrn_sz v24, v6, v7, #12, .8h // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x8_neon + mov x14, x30 + movi v7.8h, #0 + lsl x8, x8, #1 +.if \scale + mov w16, #2896*8 + dup v0.4h, w16 +.endif + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_8h_x16_neon + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 + +.macro store1 r0, r1 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + add x6, x6, #32 +.endm + store1 v16.8h, v24.8h + store1 v17.8h, v25.8h + store1 v18.8h, v26.8h + store1 v19.8h, v27.8h + store1 v20.8h, v28.8h + store1 v21.8h, v29.8h + store1 v22.8h, v30.8h + store1 v23.8h, v31.8h +.purgem store1 + sub x6, x6, #64*8 + + movi v7.8h, #0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr +.if \scale + // This 
relies on the fact that the idct also leaves the right coeff in v0.h[1] + scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_8h_x16_neon + transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 + transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 +.macro store2 r0, r1, shift + ld1 {v4.8h, v5.8h}, [x6] + sqsub v7.8h, v4.8h, \r0 + sqsub v6.8h, v5.8h, \r1 + sqadd v4.8h, v4.8h, \r0 + sqadd v5.8h, v5.8h, \r1 + rev64 v6.8h, v6.8h + rev64 v7.8h, v7.8h + srshr v4.8h, v4.8h, #\shift + srshr v5.8h, v5.8h, #\shift + srshr v6.8h, v6.8h, #\shift + srshr v7.8h, v7.8h, #\shift + ext v6.16b, v6.16b, v6.16b, #8 + st1 {v4.8h, v5.8h}, [x6], #32 + ext v7.16b, v7.16b, v7.16b, #8 + st1 {v6.8h, v7.8h}, [x6], #32 +.endm + + store2 v31.8h, v23.8h, \shift + store2 v30.8h, v22.8h, \shift + store2 v29.8h, v21.8h, \shift + store2 v28.8h, v20.8h, \shift + store2 v27.8h, v19.8h, \shift + store2 v26.8h, v18.8h, \shift + store2 v25.8h, v17.8h, \shift + store2 v24.8h, v16.8h, \shift +.purgem store2 + br x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl inv_dct_8h_x16_neon + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl inv_dct32_odd_8h_x16_neon + + neg x9, x8 + mov x10, x6 +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8b}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8b}, [x10], x1 + \op v5.8h, v5.8h, \r0 + 
ld1 {v7.8h}, [x7], \stride + ld1 {v4.8b}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + uaddw v5.8h, v5.8h, v2.8b + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + sqxtun v2.8b, v5.8h + ld1 {v5.8h}, [x7], \stride + uaddw v6.8h, v6.8h, v3.8b + srshr v7.8h, v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8b}, [x6], x1 + ld1 {v2.8b}, [x10], x1 + sqxtun v3.8b, v6.8h + uaddw v7.8h, v7.8h, v4.8b + srshr v5.8h, v5.8h, #4 + st1 {v3.8b}, [x6], x1 + sqxtun v4.8b, v7.8h + uaddw v5.8h, v5.8h, v2.8b + st1 {v4.8b}, [x6], x1 + sqxtun v2.8b, v5.8h + st1 {v2.8b}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + br x14 +endfunc + +const eob_32x32 + .short 36, 136, 300, 1024 +endconst + +const eob_16x32 + .short 36, 151, 279, 512 +endconst + +const eob_16x32_shortside + .short 36, 512 +endconst + +const eob_8x32 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + movi v0.8h, #0 + movrel x13, eob_32x32 + + mov x8, #2*32 +1: + mov w9, #0 + movrel x12, eob_32x32 +2: + add w9, w9, #8 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #2 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #2*8 + b 1b +9: + ret +endfunc + +.macro shift_8_regs op, shift +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, 
v20.8h, v21.8h, v22.8h, v23.8h + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + mov w16, #2896*8 + mov w17, #2*(5793-4096)*8 + dup v1.4h, w16 + movi v0.8h, #0 + mov v1.h[1], w17 + movrel x13, eob_16x32\hshort + + mov x8, #2*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort +2: + add w9, w9, #8 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 v1.h[1] +.else + // 32x16 + shift_8_regs sqshl, 1 + identity_8x8 v1.h[1] +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #2 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #2*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + movi v0.8h, #0 + movrel x13, eob_8x32 + + mov w8, #2*\h +1: + ldrh w12, [x13], #2 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs srshr, 1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #2*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 
+def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 8, 16, 24 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_8h_x16_neon + +.irp i, 0, 8, 16, 24 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #2*32 + bl inv_txfm_horz_scale_16x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #8 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + adr x5, inv_dct_8h_x16_neon + +.irp i, 0, 8 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, #36 + b.lt 1f +.endif + mov x8, #2*16 + bl inv_txfm_horz_scale_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, 
v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.8h, #0 + mov x8, #2*32 + mov w9, #32 + mov x6, sp +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x2] + st1 {v28.8h}, [x2], x8 +.endr + ldrh w12, [x13], #2 + sub x2, x2, x8, lsl #3 + sub w9, w9, #8 + add x2, x2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + srshr v\i\().8h, v\i\().8h, #2 +.endr + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + cmp w3, w12 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #8 +.rept 2 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 +.endr + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + + mov x6, sp + mov x7, x2 + mov x8, #8*2 + bl inv_txfm_horz_dct_32x8_neon + + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl inv_dct_8h_x8_neon + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + br x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.8h, v1.8h}, [x17], #32 + + sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a + 
sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a + sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a + sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a + sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a + sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a + sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a + sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a + + sqadd v24.8h, v16.8h, v17.8h // t32 + sqsub v25.8h, v16.8h, v17.8h // t33 + sqsub v26.8h, v19.8h, v18.8h // t34 + sqadd v27.8h, v19.8h, v18.8h // t35 + sqadd v28.8h, v20.8h, v21.8h // t60 + sqsub v29.8h, v20.8h, v21.8h // t61 + sqsub v30.8h, v23.8h, v22.8h // t62 + sqadd v31.8h, v23.8h, v22.8h // t63 + + smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a + smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a + neg v2.4s, v2.4s // t34a + neg v3.4s, v3.4s // t34a + smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a + rshrn_sz v26, v2, v3, #12, .8h // t34a + smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a + rshrn_sz v29, v4, v5, #12, .8h // t61a + rshrn_sz v25, v6, v7, #12, .8h // t33a + rshrn_sz v30, v2, v3, #12, .8h // t62a + + sqadd v16.8h, v24.8h, v27.8h // t32a + sqsub v19.8h, v24.8h, v27.8h // t35a + sqadd v17.8h, v25.8h, v26.8h // t33 + sqsub v18.8h, v25.8h, v26.8h // t34 + sqsub v20.8h, v31.8h, v28.8h // t60a + sqadd v23.8h, v31.8h, v28.8h // t63a + sqsub v21.8h, v30.8h, v29.8h // t61 + sqadd v22.8h, v30.8h, v29.8h // t62 + + smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a + smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a + smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 + rshrn_sz v21, v2, v3, #12, .8h // t61a + rshrn_sz v18, v4, v5, #12, .8h // t34a + smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 + rshrn_sz v20, v6, v7, #12, .8h // t60 + rshrn_sz v19, v2, v3, #12, .8h // t35 + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + 
ld1 {v0.4h}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #2*8*0] // t32a + ldr q17, [x9, #2*8*8] // t39a + ldr q18, [x9, #2*8*0] // t63a + ldr q19, [x6, #2*8*8] // t56a + ldr q20, [x6, #2*8*16] // t40a + ldr q21, [x9, #2*8*24] // t47a + ldr q22, [x9, #2*8*16] // t55a + ldr q23, [x6, #2*8*24] // t48a + + sqadd v24.8h, v16.8h, v17.8h // t32 + sqsub v25.8h, v16.8h, v17.8h // t39 + sqadd v26.8h, v18.8h, v19.8h // t63 + sqsub v27.8h, v18.8h, v19.8h // t56 + sqsub v28.8h, v21.8h, v20.8h // t40 + sqadd v29.8h, v21.8h, v20.8h // t47 + sqadd v30.8h, v23.8h, v22.8h // t48 + sqsub v31.8h, v23.8h, v22.8h // t55 + + smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a + smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a + smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a + rshrn_sz v25, v2, v3, #12, .8h // t56a + rshrn_sz v27, v4, v5, #12, .8h // t39a + neg v6.4s, v6.4s // t40a + neg v7.4s, v7.4s // t40a + smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a + rshrn_sz v31, v6, v7, #12, .8h // t40a + rshrn_sz v28, v2, v3, #12, .8h // t55a + + sqadd v16.8h, v24.8h, v29.8h // t32a + sqsub v19.8h, v24.8h, v29.8h // t47a + sqadd v17.8h, v27.8h, v31.8h // t39 + sqsub v18.8h, v27.8h, v31.8h // t40 + sqsub v20.8h, v26.8h, v30.8h // t48a + sqadd v23.8h, v26.8h, v30.8h // t63a + sqsub v21.8h, v25.8h, v28.8h // t55 + sqadd v22.8h, v25.8h, v28.8h // t56 + + smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a + smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a + smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 + rshrn_sz v18, v2, v3, #12, .8h // t40a + rshrn_sz v21, v4, v5, #12, .8h // t55a + smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 + rshrn_sz v19, v6, v7, #12, .8h // t47 + rshrn_sz v20, v2, v3, #12, .8h // t48 + + str q16, [x6, #2*8*0] // t32a + str q17, [x9, #2*8*0] // 
t39 + str q18, [x6, #2*8*8] // t40a + str q19, [x9, #2*8*8] // t47 + str q20, [x6, #2*8*16] // t48 + str q21, [x9, #2*8*16] // t55a + str q22, [x6, #2*8*24] // t56 + str q23, [x9, #2*8*24] // t63a + + add x6, x6, #2*8 + sub x9, x9, #2*8 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + mov \gpr, \val + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.8h, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_8h_x16_neon + + store16 x6 + + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.8h, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + 
scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_8h_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.8h}, [x6], #16 + ld1 {v3.8h}, [x6], #16 + sqadd v6.8h, v2.8h, \r0 + sqsub \r0, v2.8h, \r0 + ld1 {v4.8h}, [x6], #16 + sqadd v7.8h, v3.8h, \r1 + sqsub \r1, v3.8h, \r1 + ld1 {v5.8h}, [x6], #16 + sqadd v2.8h, v4.8h, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.8h, \r2 + st1 {v6.8h}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.8h, v5.8h, \r3 + sqsub \r3, v5.8h, \r3 + st1 {v7.8h}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.8h}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.8h}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.8h, v30.8h, v29.8h, v28.8h + store_addsub v27.8h, v26.8h, v25.8h, v24.8h + store_addsub v23.8h, v22.8h, v21.8h, v20.8h + store_addsub v19.8h, v18.8h, v17.8h, v16.8h +.purgem store_addsub + + add x6, x6, #2*8*16 + + movrel x17, idct64_coeffs + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.8h}, [x7] // in1 (offset 0) + ld1 {v17.8h}, [x9] // in31 (offset 15) + ld1 {v18.8h}, [x10] // in17 (offset 8) + ld1 {v19.8h}, [x11] // in15 (offset 7) + st1_if {v7.8h}, [x7], \clear + st1_if {v7.8h}, [x9], \clear + st1_if {v7.8h}, [x10], \clear + st1_if {v7.8h}, [x11], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.8h}, [x10] // in7 (offset 3) + ld1 {v17.8h}, [x11] // in25 (offset 12) + ld1 {v18.8h}, [x9] // in23 (offset 11) + ld1 {v19.8h}, [x7] // in9 (offset 4) + st1_if {v7.8h}, [x7], \clear + st1_if {v7.8h}, [x9], \clear + st1_if {v7.8h}, [x10], 
\clear + st1_if {v7.8h}, [x11], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #2*8*32 + add x9, x6, #2*8*7 + + bl inv_dct64_step2_neon + + br x14 +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x8_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-2*8*4 + + dup v7.8h, w12 +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.8h, \src0, \src1 + sqadd v0.8h, \src0, \src1 + sqsub v3.8h, \src2, \src3 + srshl v1.8h, v1.8h, v7.8h + sqadd v2.8h, \src2, \src3 + srshl v0.8h, v0.8h, v7.8h + srshl 
v3.8h, v3.8h, v7.8h + rev64 v1.8h, v1.8h + srshl v2.8h, v2.8h, v7.8h + rev64 v3.8h, v3.8h + ext v1.16b, v1.16b, v1.16b, #8 + st1 {v0.8h}, [x6], x10 + ext v3.16b, v3.16b, v3.16b, #8 + st1 {v1.8h}, [x9], x10 + st1 {v2.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.8h, v31.8h, v17.8h, v30.8h + store_addsub v18.8h, v29.8h, v19.8h, v28.8h + store_addsub v20.8h, v27.8h, v21.8h, v26.8h + store_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem store_addsub + sub x6, x6, x10, lsl #3 + sub x9, x9, x10, lsl #3 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + br x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8b}, [x6], x1 + ld1 {v1.8b}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8b}, [x6] + sqsub v5.8h, \src0, \src1 + ld1 {v3.8b}, [x9] + sqadd v6.8h, \src2, \src3 + sqsub v7.8h, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr v6.8h, v6.8h, #4 + uaddw v4.8h, v4.8h, v0.8b + srshr v7.8h, v7.8h, #4 + uaddw v5.8h, v5.8h, v1.8b + uaddw v6.8h, v6.8h, v2.8b + sqxtun v0.8b, v4.8h + uaddw v7.8h, v7.8h, v3.8b + sqxtun v1.8b, v5.8h + st1 {v0.8b}, [x6], x1 + sqxtun v2.8b, v6.8h + st1 {v1.8b}, [x9], x10 + sqxtun v3.8b, v7.8h + st1 {v2.8b}, [x6], x1 + st1 {v3.8b}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + br x14 +endfunc + +.macro sub_sp space +#ifdef _WIN32 +.if 
\space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_8h_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_8h_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + 
b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_scale_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*8*2 + add x4, sp, #64*8*2 + + movrel x13, eob_16x32 + +.irp i, 0, 8 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #16*2 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_8h_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 8 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + adr x5, inv_dct_8h_x16_neon +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, 
#64*8*2 + + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_8h_x16_neon +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_16x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #8 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + br x15 +endfunc diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S new file mode 100644 index 0000000000..266f57e36e --- /dev/null +++ b/third_party/dav1d/src/arm/64/itx16.S @@ -0,0 +1,3526 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, +// int bitdepth_max); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), 
-1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro mul_mla d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mla \d\().4s, \s1\().4s, \c1 +.endm + +.macro mul_mls d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mls \d\().4s, \s1\().4s, \c1 +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff 
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , , \dst, \src + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src + load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src + load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src + load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src + load_add_store , , , , , v31.8h, v30.8h, \dst, \src + load_add_store , , , , , , v31.8h, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, 
, \dst, \src, \shiftbits + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits + load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits + load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 
+.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src + load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src + load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src + load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src + load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src + load_add_store4 , , , , , , , , v30.d, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src + load_add_store4 , , , , , , , , v22.d, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + 
dup v0.2s, w16 + sqrdmulh v20.4s, v16.4s, v0.s[0] + str wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v20.4s, v20.4s, v0.s[0] +.endif +.if \shift > 0 + sqrshrn v16.4h, v20.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift +.else + sqxtn v16.4h, v20.4s + sqxtn2 v16.8h, v20.4s +.endif + sqrdmulh v16.8h, v16.8h, v0.h[1] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + subs w4, w4, #4 + ld1 {v1.d}[1], [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sub x0, x0, x1, lsl #2 + sqadd v1.8h, v1.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h}, [x0], x1 + subs w4, w4, #4 + ld1 {v1.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v2.8h}, [x0], x1 + sqadd v1.8h, v1.8h, v16.8h + ld1 {v3.8h}, [x0], x1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sub x0, x0, x1, lsl #2 + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + st1 {v0.8h}, [x0], x1 + smin v2.8h, v2.8h, v31.8h + st1 {v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h}, [x0], x1 + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, x1, lsl #1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax 
v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + st1 {v0.8h, v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w32_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x1, x1, #64 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, #64 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v16.8h + sqadd v6.8h, v6.8h, v16.8h + sqadd v7.8h, v7.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smax v6.8h, v6.8h, v30.8h + smax v7.8h, v7.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + smin v6.8h, v6.8h, v31.8h + smin v7.8h, v7.8h, v31.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4s, v16.4s, v17.4s + sub v21.4s, v18.4s, v19.4s + sub v20.4s, v16.4s, v21.4s + sshr v20.4s, v20.4s, #1 + sub v18.4s, v20.4s, v17.4s + sub 
v17.4s, v20.4s, v19.4s + add v19.4s, v21.4s, v18.4s + sub v16.4s, v16.4s, v17.4s +.endm + +.macro idct_4 r0, r1, r2, r3 + mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] + mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] + srshr v6.4s, v6.4s, #12 + srshr v7.4s, v4.4s, #12 + srshr v2.4s, v2.4s, #12 + srshr v3.4s, v3.4s, #12 + sqadd \r0\().4s, v2.4s, v6.4s + sqsub \r3\().4s, v2.4s, v6.4s + sqadd \r1\().4s, v3.4s, v7.4s + sqsub \r2\().4s, v3.4s, v7.4s +.endm + +function inv_dct_4s_x4_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] + idct_4 v16, v17, v18, v19 + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.4s}, [x16] + + sub v3.4s, v16.4s, v18.4s + mul v4.4s, v16.4s, v0.s[0] + mla v4.4s, v18.4s, v0.s[1] + mla v4.4s, v19.4s, v0.s[2] + mul v7.4s, v17.4s, v0.s[3] + add v3.4s, v3.4s, v19.4s + mul v5.4s, v16.4s, v0.s[2] + mls v5.4s, v18.4s, v0.s[0] + mls v5.4s, v19.4s, v0.s[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[3] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + srshr \o0\().4s, \o0\().4s, #12 + srshr \o2\().4s, \o2\().4s, #12 + srshr \o1\().4s, \o1\().4s, #12 + srshr \o3\().4s, \o3\().4s, #12 +.endm + +function inv_adst_4s_x4_neon + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x4_neon + movz w16, #(5793-4096)*8, lsl #16 + dup v0.2s, w16 + sqrdmulh v4.4s, v16.4s, v0.s[0] + sqrdmulh v5.4s, v17.4s, v0.s[0] + sqrdmulh v6.4s, v18.4s, v0.s[0] + sqrdmulh v7.4s, v19.4s, v0.s[0] + sqadd v16.4s, v16.4s, v4.4s + sqadd v17.4s, v17.4s, v5.4s + sqadd v18.4s, v18.4s, v6.4s + sqadd v19.4s, v19.4s, v7.4s + ret +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + mov x15, x30 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + 
sshr v16.4s, v16.4s, #2 + sshr v17.4s, v17.4s, #2 + sshr v18.4s, v18.4s, #2 + sshr v19.4s, v19.4s, #2 + + iwht4 + + st1 {v30.4s, v31.4s}, [x2], #32 + transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.d}[0], [x0], x1 + sqxtn v16.4h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqxtn2 v16.8h, v17.4s + ld1 {v1.d}[0], [x0], x1 + sqxtn v18.4h, v18.4s + ld1 {v1.d}[1], [x0], x1 + sqxtn2 v18.8h, v19.4s + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + blr x4 + + st1 {v30.4s, v31.4s}, [x2], #32 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.d}[0], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x0, x0, x1, lsl #2 + sqadd v16.8h, v16.8h, v0.8h + sqadd v18.8h, v18.8h, v1.8h + smax v16.8h, v16.8h, v30.8h + smax v18.8h, v18.8h, v30.8h + smin v16.8h, v16.8h, v31.8h + st1 {v16.d}[0], [x0], x1 + smin v18.8h, v18.8h, v31.8h + st1 {v16.d}[1], [x0], x1 + st1 {v18.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + + br x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v4.2s, w16 + str wzr, [x2] + sqrdmulh v16.4s, v16.4s, v4.s[0] + ld1 {v0.d}[0], [x0], x1 + sqxtn v20.4h, v16.4s + sqxtn2 v20.8h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqrdmulh v20.8h, v20.8h, v4.h[1] + ld1 {v1.d}[0], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.d}[1], [x0], x1 + srshr v18.8h, v20.8h, #4 + movi v30.8h, #0 + b L(itx_4x4_end) +1: +.endif + adr x4, 
inv_\txfm1\()_4s_x4_neon + movrel x5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4 \r0, \r2, \r4, \r6 + + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a + mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a + mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a + srshr \r1\().4s, v2.4s, #12 // t4a + srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r3\().4s, v6.4s, #12 // t5a + srshr \r5\().4s, v7.4s, #12 // t6a + + sqadd v2.4s, \r1\().4s, \r3\().4s // t4 + sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a + sqadd v3.4s, \r7\().4s, \r5\().4s // t7 + sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a + + mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 + mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 + srshr v4.4s, v4.4s, #12 // t5 + srshr v5.4s, v6.4s, #12 // t6 + + sqsub \r7\().4s, \r0\().4s, v3.4s // out7 + sqadd \r0\().4s, \r0\().4s, v3.4s // out0 + sqadd \r1\().4s, \r2\().4s, v5.4s // out1 + sqsub v6.4s, \r2\().4s, v5.4s // out6 + sqadd \r2\().4s, \r4\().4s, v4.4s // out2 + sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r3\().4s, \r6\().4s, v2.4s // out3 + sqsub \r4\().4s, \r6\().4s, v2.4s // out4 + mov \r6\().16b, v6.16b // out6 +.endm + +function inv_dct_4s_x8_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 + movrel x16, iadst8_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v23, v16, v0.s[0], 
v0.s[1] + mul_mls v4, v23, v16, v0.s[1], v0.s[0] + mul_mla v6, v21, v18, v0.s[2], v0.s[3] + srshr v16.4s, v2.4s, #12 // t0a + srshr v23.4s, v4.4s, #12 // t1a + mul_mls v2, v21, v18, v0.s[3], v0.s[2] + mul_mla v4, v19, v20, v1.s[0], v1.s[1] + srshr v18.4s, v6.4s, #12 // t2a + srshr v21.4s, v2.4s, #12 // t3a + mul_mls v6, v19, v20, v1.s[1], v1.s[0] + mul_mla v2, v17, v22, v1.s[2], v1.s[3] + srshr v20.4s, v4.4s, #12 // t4a + srshr v19.4s, v6.4s, #12 // t5a + mul_mls v4, v17, v22, v1.s[3], v1.s[2] + srshr v22.4s, v2.4s, #12 // t6a + srshr v17.4s, v4.4s, #12 // t7a + + ld1 {v0.4s}, [x16] + + sqadd v2.4s, v16.4s, v20.4s // t0 + sqsub v3.4s, v16.4s, v20.4s // t4 + sqadd v4.4s, v23.4s, v19.4s // t1 + sqsub v5.4s, v23.4s, v19.4s // t5 + sqadd v6.4s, v18.4s, v22.4s // t2 + sqsub v7.4s, v18.4s, v22.4s // t6 + sqadd v18.4s, v21.4s, v17.4s // t3 + sqsub v19.4s, v21.4s, v17.4s // t7 + + mul_mla v16, v3, v5, v0.s[3], v0.s[2] + mul_mls v20, v3, v5, v0.s[2], v0.s[3] + mul_mls v22, v19, v7, v0.s[3], v0.s[2] + + srshr v3.4s, v16.4s, #12 // t4a + srshr v5.4s, v20.4s, #12 // t5a + + mul_mla v16, v19, v7, v0.s[2], v0.s[3] + + srshr v7.4s, v22.4s, #12 // t6a + srshr v19.4s, v16.4s, #12 // t7a + + sqadd \o0\().4s, v2.4s, v6.4s // out0 + sqsub v2.4s, v2.4s, v6.4s // t2 + sqadd \o7\().4s, v4.4s, v18.4s // out7 + sqsub v4.4s, v4.4s, v18.4s // t3 + sqneg \o7\().4s, \o7\().4s // out7 + + sqadd \o1\().4s, v3.4s, v7.4s // out1 + sqsub v3.4s, v3.4s, v7.4s // t6 + sqadd \o6\().4s, v5.4s, v19.4s // out6 + sqsub v5.4s, v5.4s, v19.4s // t7 + sqneg \o1\().4s, \o1\().4s // out1 + + mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) + mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) + mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) + srshr v2.4s, v18.4s, #12 // out3 + mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) + srshr v3.4s, v20.4s, #12 // out5 + srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) + srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) + 
+ sqneg \o3\().4s, v2.4s // out3 + sqneg \o5\().4s, v3.4s // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +function inv_flipadst_4s_x8_neon + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x8_neon + sqshl v16.4s, v16.4s, #1 + sqshl v17.4s, v17.4s, #1 + sqshl v18.4s, v18.4s, #1 + sqshl v19.4s, v19.4s, #1 + sqshl v20.4s, v20.4s, #1 + sqshl v21.4s, v21.4s, #1 + sqshl v22.4s, v22.4s, #1 + sqshl v23.4s, v23.4s, #1 + ret +endfunc + +function inv_txfm_add_8x8_neon + movi v31.4s, #0 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + + blr x5 + + load_add_store_8x8 x0, x7 + br x15 +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + movrel x5, X(inv_\txfm2\()_8h_x8_neon) + mov w13, #\eob_half + adr x4, 
inv_\txfm1\()_4s_x8_neon + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + sqxtn v20.4h, v20.4s + sqxtn v21.4h, v21.4s + sqxtn v22.4h, v22.4s + sqxtn v23.4h, v23.4s + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + br x15 +endfunc + +function inv_txfm_add_4x8_neon + movz w16, #2896*8, lsl #16 + movi v31.4s, #0 + dup v30.2s, w16 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v20.4h, v16.4s + sqxtn v21.4h, v17.4s + sqxtn v22.4h, v18.4s + sqxtn v23.4h, v19.4s + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f + +1: +.irp i, v20, v21, v22, v23 + movi \i\().4h, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 
{v31.4s}, [x2], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x8 x0, x7 + br x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov w13, #\eob_half +.endif + movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +function inv_dct_4s_x16_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #32 + + mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a + mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a + srshr v17.4s, v2.4s, #12 // t8a + srshr v31.4s, v4.4s, #12 // t15a + mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a + mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + srshr v23.4s, v6.4s, #12 // t9a + srshr v25.4s, v2.4s, #12 // t14a + mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a + mul_mls v2, v29, 
v19, v1.s[2], v1.s[3] // -> t11a + srshr v21.4s, v4.4s, #12 // t10a + srshr v27.4s, v6.4s, #12 // t13a + mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + srshr v19.4s, v2.4s, #12 // t11a + srshr v29.4s, v4.4s, #12 // t12a + + ld1 {v0.4s}, [x16] + + sqsub v2.4s, v17.4s, v23.4s // t9 + sqadd v17.4s, v17.4s, v23.4s // t8 + sqsub v3.4s, v31.4s, v25.4s // t14 + sqadd v31.4s, v31.4s, v25.4s // t15 + sqsub v23.4s, v19.4s, v21.4s // t10 + sqadd v19.4s, v19.4s, v21.4s // t11 + sqadd v25.4s, v29.4s, v27.4s // t12 + sqsub v29.4s, v29.4s, v27.4s // t13 + + mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a + mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a + srshr v21.4s, v4.4s, #12 // t9a + srshr v27.4s, v6.4s, #12 // t14a + + mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a + srshr v29.4s, v4.4s, #12 // t13a + neg v6.4s, v6.4s + srshr v23.4s, v6.4s, #12 // t10a + + sqsub v2.4s, v17.4s, v19.4s // t11a + sqadd v17.4s, v17.4s, v19.4s // t8a + sqsub v3.4s, v31.4s, v25.4s // t12a + sqadd v31.4s, v31.4s, v25.4s // t15a + sqadd v19.4s, v21.4s, v23.4s // t9 + sqsub v21.4s, v21.4s, v23.4s // t10 + sqsub v25.4s, v27.4s, v29.4s // t13 + sqadd v27.4s, v27.4s, v29.4s // t14 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 + mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a + + srshr v4.4s, v4.4s, #12 // t11 + srshr v5.4s, v6.4s, #12 // t12 + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v2.4s, v2.4s, #12 // t10a + srshr v3.4s, v6.4s, #12 // t13a + + sqadd v6.4s, v16.4s, v31.4s // out0 + sqsub v31.4s, v16.4s, v31.4s // out15 + mov v16.16b, v6.16b + sqadd v23.4s, v30.4s, v17.4s // out7 + sqsub v7.4s, v30.4s, v17.4s // out8 + sqadd v17.4s, v18.4s, v27.4s // out1 + sqsub v30.4s, v18.4s, v27.4s // out14 + sqadd v18.4s, v20.4s, v3.4s // out2 + sqsub v29.4s, v20.4s, v3.4s // out13 + sqadd v3.4s, v28.4s, v19.4s // out6 + sqsub v25.4s, v28.4s, v19.4s // out9 + sqadd v19.4s, v22.4s, 
v5.4s // out3 + sqsub v28.4s, v22.4s, v5.4s // out12 + sqadd v20.4s, v24.4s, v4.4s // out4 + sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v21.4s, v26.4s, v2.4s // out5 + sqsub v26.4s, v26.4s, v2.4s // out10 + mov v24.16b, v7.16b + mov v22.16b, v3.16b + + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel x16, iadst16_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 + mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 + srshr v16.4s, v2.4s, #12 // t0 + srshr v31.4s, v4.4s, #12 // t1 + mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 + mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 + srshr v18.4s, v6.4s, #12 // t2 + srshr v29.4s, v2.4s, #12 // t3 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 + mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 + srshr v20.4s, v4.4s, #12 // t4 + srshr v27.4s, v6.4s, #12 // t5 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 + ld1 {v0.4s, v1.4s}, [x16] + movrel x16, idct_coeffs + mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 + srshr v22.4s, v2.4s, #12 // t6 + srshr v25.4s, v4.4s, #12 // t7 + mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 + mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 + srshr v23.4s, v6.4s, #12 // t8 + srshr v24.4s, v2.4s, #12 // t9 + mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 + mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 + srshr v21.4s, v4.4s, #12 // t10 + srshr v26.4s, v6.4s, #12 // t11 + mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 + mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 + srshr v19.4s, v2.4s, #12 // t12 + srshr v28.4s, v4.4s, #12 // t13 + mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 + srshr v17.4s, v6.4s, #12 // t14 + srshr v30.4s, v2.4s, #12 // t15 + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v23.4s // t8a + sqadd v16.4s, v16.4s, v23.4s // t0a + sqsub v3.4s, v31.4s, v24.4s // t9a + sqadd v31.4s, v31.4s, v24.4s // t1a + 
sqadd v23.4s, v18.4s, v21.4s // t2a + sqsub v18.4s, v18.4s, v21.4s // t10a + sqadd v24.4s, v29.4s, v26.4s // t3a + sqsub v29.4s, v29.4s, v26.4s // t11a + sqadd v21.4s, v20.4s, v19.4s // t4a + sqsub v20.4s, v20.4s, v19.4s // t12a + sqadd v26.4s, v27.4s, v28.4s // t5a + sqsub v27.4s, v27.4s, v28.4s // t13a + sqadd v19.4s, v22.4s, v17.4s // t6a + sqsub v22.4s, v22.4s, v17.4s // t14a + sqadd v28.4s, v25.4s, v30.4s // t7a + sqsub v25.4s, v25.4s, v30.4s // t15a + + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 + mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 + mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 + srshr v17.4s, v4.4s, #12 // t8 + srshr v30.4s, v6.4s, #12 // t9 + mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 + srshr v18.4s, v2.4s, #12 // t10 + srshr v29.4s, v4.4s, #12 // t11 + mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 + srshr v27.4s, v6.4s, #12 // t12 + srshr v20.4s, v2.4s, #12 // t13 + mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 + srshr v25.4s, v4.4s, #12 // t14 + srshr v22.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t4 + sqadd v16.4s, v16.4s, v21.4s // t0 + sqsub v3.4s, v31.4s, v26.4s // t5 + sqadd v31.4s, v31.4s, v26.4s // t1 + sqadd v21.4s, v23.4s, v19.4s // t2 + sqsub v23.4s, v23.4s, v19.4s // t6 + sqadd v26.4s, v24.4s, v28.4s // t3 + sqsub v24.4s, v24.4s, v28.4s // t7 + sqadd v19.4s, v17.4s, v27.4s // t8a + sqsub v17.4s, v17.4s, v27.4s // t12a + sqadd v28.4s, v30.4s, v20.4s // t9a + sqsub v30.4s, v30.4s, v20.4s // t13a + sqadd v27.4s, v18.4s, v25.4s // t10a + sqsub v18.4s, v18.4s, v25.4s // t14a + sqadd v20.4s, v29.4s, v22.4s // t11a + sqsub v29.4s, v29.4s, v22.4s // t15a + + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a + mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a + mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a + srshr v22.4s, v4.4s, #12 // t4a + srshr v25.4s, v6.4s, #12 // t5a + mul_mla v4, v24, v23, v0.s[2], v0.s[3] 
// -> t7a + mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 + srshr v24.4s, v2.4s, #12 // t6a + srshr v23.4s, v4.4s, #12 // t7a + mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 + mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 + srshr v17.4s, v6.4s, #12 // t12 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 + srshr v29.4s, v2.4s, #12 // t13 + srshr v30.4s, v4.4s, #12 // t14 + srshr v18.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t2a +.ifc \o0, v16 + sqadd \o0\().4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 +.else + sqadd v4.4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 + mov \o0\().16b, v4.16b +.endif + sqneg \o15\().4s, \o15\().4s // out15 + + sqsub v3.4s, v29.4s, v18.4s // t15a + sqadd \o13\().4s, v29.4s, v18.4s // out13 + sqadd \o2\().4s, v17.4s, v30.4s // out2 + sqsub v26.4s, v17.4s, v30.4s // t14a + sqneg \o13\().4s, \o13\().4s // out13 + + sqadd \o1\().4s, v19.4s, v27.4s // out1 + sqsub v27.4s, v19.4s, v27.4s // t10 + sqadd \o14\().4s, v28.4s, v20.4s // out14 + sqsub v20.4s, v28.4s, v20.4s // t11 + sqneg \o1\().4s, \o1\().4s // out1 + + sqadd \o3\().4s, v22.4s, v24.4s // out3 + sqsub v22.4s, v22.4s, v24.4s // t6 + sqadd \o12\().4s, v25.4s, v23.4s // out12 + sqsub v23.4s, v25.4s, v23.4s // t7 + sqneg \o3\().4s, \o3\().4s // out3 + + mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) + mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) + mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) + + srshr v24.4s, v24.4s, #12 // out8 + srshr v4.4s, v4.4s, #12 // out7 + srshr v5.4s, v6.4s, #12 // out5 + mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) + mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) + srshr v26.4s, v6.4s, #12 // out10 + + mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) + mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or 
v25) + mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) + + srshr \o4\().4s, v2.4s, #12 // out4 + srshr v6.4s, v6.4s, #12 // out11 + srshr v7.4s, v21.4s, #12 // out9 + srshr \o6\().4s, v22.4s, #12 // out6 + +.ifc \o8, v23 + mov \o8\().16b, v24.16b + mov \o10\().16b, v26.16b +.endif + + sqneg \o7\().4s, v4.4s // out7 + sqneg \o5\().4s, v5.4s // out5 + sqneg \o11\().4s, v6.4s // out11 + sqneg \o9\().4s, v7.4s // out9 +.endm + +function inv_adst_4s_x16_neon + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + ret +endfunc + +function inv_flipadst_4s_x16_neon + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x16_neon + movz w16, #2*(5793-4096)*8, lsl #16 + dup v0.2s, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4s, v\i\().4s, v0.s[0] + sqadd v\i\().4s, v\i\().4s, v\i\().4s + sqadd v\i\().4s, v\i\().4s, v2.4s +.endr + ret +endfunc + +.macro identity_4x16_shift1 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + srshr v3.4s, v3.4s, #1 + sqadd \i, \i, v3.4s +.endr +.endm + +.macro identity_4x16 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v3.4s +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + mov x14, x30 + movi v7.4s, #0 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], 
v24, v25, v26, v27, v28, v29, v30, v31 +.endif + blr x4 + sqrshrn v16.4h, v16.4s, #\shift + sqrshrn v17.4h, v17.4s, #\shift + sqrshrn v18.4h, v18.4s, #\shift + sqrshrn v19.4h, v19.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift + sqrshrn2 v17.8h, v21.4s, #\shift + sqrshrn2 v18.8h, v22.4s, #\shift + sqrshrn2 v19.8h, v23.4s, #\shift + sqrshrn v20.4h, v24.4s, #\shift + sqrshrn v21.4h, v25.4s, #\shift + sqrshrn v22.4h, v26.4s, #\shift + sqrshrn v23.4h, v27.4s, #\shift + sqrshrn2 v20.8h, v28.4s, #\shift + sqrshrn2 v21.8h, v29.4s, #\shift + sqrshrn2 v22.8h, v30.4s, #\shift + sqrshrn2 v23.8h, v31.4s, #\shift + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 + +.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h + st1 {\i}, [x6], #16 +.endr + + br x14 +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + br x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 + ldrh w12, [x13], #2 +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*16*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + br x15 +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, 
export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + adr x4, inv_\txfm1\()_4s_x16_neon + movrel x5, X(inv_\txfm2\()_8h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_16x16 +.else + movrel x13, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_16x16_identity +.else + movrel x13, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + mov x15, x30 + movi v4.4s, #0 + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], #16 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + sqrshrn v16.4h, v24.4s, #1 + sqrshrn v17.4h, v25.4s, #1 + sqrshrn v18.4h, v26.4s, #1 + sqrshrn v19.4h, v27.4s, #1 + sqrshrn2 v16.8h, v28.4s, #1 + sqrshrn2 v17.8h, v29.4s, #1 + sqrshrn2 v18.8h, v30.4s, #1 + sqrshrn2 v19.8h, v31.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #16 + load_add_store_8x4 x6, x7 + + br x15 +endfunc + +function inv_txfm_add_4x16_neon + ldrh w12, [x13, #4] + mov x15, x30 + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + 
rshrn v28.4h, v16.4s, #1 + rshrn v29.4h, v17.4s, #1 + rshrn v30.4h, v18.4s, #1 + rshrn v31.4h, v19.4s, #1 + transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 + + b 2f +1: +.irp i, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v24.4h, v16.4s, #1 + rshrn v25.4h, v17.4s, #1 + rshrn v26.4h, v18.4s, #1 + rshrn v27.4h, v19.4s, #1 + transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v20.4h, v16.4s, #1 + rshrn v21.4h, v17.4s, #1 + rshrn v22.4h, v18.4s, #1 + rshrn v23.4h, v19.4s, #1 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f +1: +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + movi \i, #0 +.endr +2: + + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v2.4s}, [x2], x11 +.endr + blr x4 + rshrn v16.4h, v16.4s, #1 + rshrn v17.4h, v17.4s, #1 + rshrn v18.4h, v18.4s, #1 + rshrn v19.4h, v19.4s, #1 + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x16 x0, x6 + + br x15 +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_4x16 +.else + movrel x13, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, 
eob_4x16_identity2 +.else + movrel x13, eob_4x16 +.endif +.endif +.else + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct +def_fn_416 \w, \h, identity, identity +def_fn_416 \w, \h, dct, adst +def_fn_416 \w, \h, dct, flipadst +def_fn_416 \w, \h, dct, identity +def_fn_416 \w, \h, adst, dct +def_fn_416 \w, \h, adst, adst +def_fn_416 \w, \h, adst, flipadst +def_fn_416 \w, \h, flipadst, dct +def_fn_416 \w, \h, flipadst, adst +def_fn_416 \w, \h, flipadst, flipadst +def_fn_416 \w, \h, identity, dct +def_fn_416 \w, \h, adst, identity +def_fn_416 \w, \h, flipadst, identity +def_fn_416 \w, \h, identity, adst +def_fn_416 \w, \h, identity, flipadst +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +function inv_txfm_add_16x8_neon + mov x15, x30 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + sqrshrn v12.4h, v24.4s, #1 + sqrshrn v13.4h, v25.4s, #1 + sqrshrn v14.4h, v26.4s, #1 + sqrshrn v15.4h, v27.4s, #1 + sqrshrn2 v12.8h, v28.4s, #1 + sqrshrn2 v13.8h, v29.4s, #1 + sqrshrn2 v14.8h, v30.4s, #1 + sqrshrn2 v15.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + transpose_4x8h 
v12, v13, v14, v15, v2, v3, v4, v5 + + b 2f +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h + movi \i, #0 +.endr +2: + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + movi v4.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + sqrshrn v8.4h, v24.4s, #1 + sqrshrn v9.4h, v25.4s, #1 + sqrshrn v10.4h, v26.4s, #1 + sqrshrn v11.4h, v27.4s, #1 + sqrshrn2 v8.8h, v28.4s, #1 + sqrshrn2 v9.8h, v29.4s, #1 + sqrshrn2 v10.8h, v30.4s, #1 + sqrshrn2 v11.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + + mov v16.16b, v8.16b + mov v17.16b, v9.16b + mov v18.16b, v10.16b + mov v19.16b, v11.16b + mov v20.16b, v12.16b + mov v21.16b, v13.16b + mov v22.16b, v14.16b + mov v23.16b, v15.16b + + blr x5 + + add x0, x0, #16 + load_add_store_8x8 x0, x7 + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x15 +endfunc + +function inv_txfm_add_8x16_neon + mov x15, x30 + stp d8, d9, [sp, #-0x20]! 
+ stp d10, d11, [sp, #0x10] + ldrh w12, [x13, #4] + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + sqrshrn2 v28.8h, v20.4s, #1 + sqrshrn2 v29.8h, v21.4s, #1 + sqrshrn2 v30.8h, v22.4s, #1 + sqrshrn2 v31.8h, v23.4s, #1 + transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, 
#1 + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h + movi \i, #0 +.endr + +2: + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + blr x5 + + load_add_store_8x16 x0, x6 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x20 + + br x15 +endfunc + +const eob_8x16 + .short 10, 43, 75, 128 +endconst + +const eob_8x16_identity1 + .short 4, 64, 96, 128 +endconst + +const eob_8x16_identity2 + .short 4, 8, 12, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_8x16 +.else + movrel x13, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_8x16_identity2 +.else + movrel x13, eob_8x16 +.endif +.endif +.if \h == 8 + ldrh w13, [x13] +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, 
adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4s_x16_neon + movrel x16, idct_coeffs, 4*16 + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a + mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a + mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a + srshr v16.4s, v2.4s, #12 // t16a + srshr v31.4s, v4.4s, #12 // t31a + mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a + mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a + srshr v24.4s, v6.4s, #12 // t17a + srshr v23.4s, v2.4s, #12 // t30a + mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a + mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a + srshr v20.4s, v4.4s, #12 // t18a + srshr v27.4s, v6.4s, #12 // t29a + mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #4*24 + mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a + srshr v28.4s, v2.4s, #12 // t19a + srshr v19.4s, v4.4s, #12 // t28a + mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a + mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a + srshr v18.4s, v6.4s, #12 // t20a + srshr v29.4s, v2.4s, #12 // t27a + mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a + mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a + srshr v26.4s, v4.4s, #12 // t21a + srshr v21.4s, v6.4s, #12 // t26a + mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a + mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a + srshr v22.4s, v2.4s, #12 // t22a + srshr v25.4s, v4.4s, #12 // t25a + mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a + srshr v30.4s, v6.4s, #12 // t23a + srshr v17.4s, v2.4s, #12 // t24a + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v24.4s // t17 + sqadd v16.4s, v16.4s, v24.4s // t16 + sqsub v3.4s, v31.4s, v23.4s // t30 + sqadd v31.4s, v31.4s, v23.4s // 
t31 + sqsub v24.4s, v28.4s, v20.4s // t18 + sqadd v28.4s, v28.4s, v20.4s // t19 + sqadd v23.4s, v18.4s, v26.4s // t20 + sqsub v18.4s, v18.4s, v26.4s // t21 + sqsub v20.4s, v30.4s, v22.4s // t22 + sqadd v30.4s, v30.4s, v22.4s // t23 + sqadd v26.4s, v17.4s, v25.4s // t24 + sqsub v17.4s, v17.4s, v25.4s // t25 + sqsub v22.4s, v29.4s, v21.4s // t26 + sqadd v29.4s, v29.4s, v21.4s // t27 + sqadd v25.4s, v19.4s, v27.4s // t28 + sqsub v19.4s, v19.4s, v27.4s // t29 + + mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a + mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a + mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a + srshr v21.4s, v4.4s, #12 // t17a + srshr v27.4s, v6.4s, #12 // t30a + neg v2.4s, v2.4s // -> t18a + mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a + srshr v19.4s, v2.4s, #12 // t18a + srshr v24.4s, v4.4s, #12 // t29a + mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a + mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + srshr v22.4s, v6.4s, #12 // t21a + srshr v18.4s, v2.4s, #12 // t26a + neg v4.4s, v4.4s // -> t22a + mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a + srshr v17.4s, v4.4s, #12 // t22a + srshr v20.4s, v6.4s, #12 // t25a + + sqsub v2.4s, v27.4s, v24.4s // t29 + sqadd v27.4s, v27.4s, v24.4s // t30 + sqsub v3.4s, v21.4s, v19.4s // t18 + sqadd v21.4s, v21.4s, v19.4s // t17 + sqsub v24.4s, v16.4s, v28.4s // t19a + sqadd v16.4s, v16.4s, v28.4s // t16a + sqsub v19.4s, v30.4s, v23.4s // t20a + sqadd v30.4s, v30.4s, v23.4s // t23a + sqsub v28.4s, v17.4s, v22.4s // t21 + sqadd v17.4s, v17.4s, v22.4s // t22 + sqadd v23.4s, v26.4s, v29.4s // t24a + sqsub v26.4s, v26.4s, v29.4s // t27a + sqadd v22.4s, v20.4s, v18.4s // t25 + sqsub v20.4s, v20.4s, v18.4s // t26 + sqsub v29.4s, v31.4s, v25.4s // t28a + sqadd v31.4s, v31.4s, v25.4s // t31a + + mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a + mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a + mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 + srshr v18.4s, 
v4.4s, #12 // t18a + srshr v25.4s, v6.4s, #12 // t29a + mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 + srshr v29.4s, v2.4s, #12 // t19 + srshr v24.4s, v4.4s, #12 // t28 + neg v6.4s, v6.4s // -> t20 + mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 + mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + srshr v26.4s, v6.4s, #12 // t20 + srshr v19.4s, v2.4s, #12 // t27 + neg v4.4s, v4.4s // -> t21a + mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a + srshr v20.4s, v4.4s, #12 // t21a + srshr v28.4s, v6.4s, #12 // t26a + + sqsub v2.4s, v16.4s, v30.4s // t23 + sqadd v16.4s, v16.4s, v30.4s // t16 = out16 + sqsub v3.4s, v31.4s, v23.4s // t24 + sqadd v31.4s, v31.4s, v23.4s // t31 = out31 + sqsub v23.4s, v21.4s, v17.4s // t22a + sqadd v17.4s, v21.4s, v17.4s // t17a = out17 + sqadd v30.4s, v27.4s, v22.4s // t30a = out30 + sqsub v21.4s, v27.4s, v22.4s // t25a + sqsub v27.4s, v18.4s, v20.4s // t21 + sqadd v18.4s, v18.4s, v20.4s // t18 = out18 + sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqsub v26.4s, v29.4s, v26.4s // t20a + sqadd v29.4s, v25.4s, v28.4s // t29 = out29 + sqsub v25.4s, v25.4s, v28.4s // t26 + sqadd v28.4s, v24.4s, v19.4s // t28a = out28 + sqsub v24.4s, v24.4s, v19.4s // t27a + mov v19.16b, v4.16b // out19 + + mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 + mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 + srshr v20.4s, v4.4s, #12 // t20 + srshr v22.4s, v6.4s, #12 // t27 + + mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a + mov v27.16b, v22.16b // t27 + srshr v26.4s, v4.4s, #12 // t26a + + mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 + mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + srshr v21.4s, v6.4s, #12 // t21a + srshr v22.4s, v24.4s, #12 // t22 + srshr v25.4s, v4.4s, #12 // t25 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a + srshr v23.4s, v4.4s, #12 // t23a + srshr v24.4s, v6.4s, 
#12 // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + mov x14, x30 + movi v7.4s, #0 + lsl x8, x8, #1 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_4s_x16_neon + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 + transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 + +.macro store1 r0, r1, r2, r3 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + st1 {\r2}, [x6], #16 + st1 {\r3}, [x6], #16 +.endm + store1 v16.4s, v20.4s, v24.4s, v28.4s + store1 v17.4s, v21.4s, v25.4s, v29.4s + store1 v18.4s, v22.4s, v26.4s, v30.4s + store1 v19.4s, v23.4s, v27.4s, v31.4s +.purgem store1 + sub x6, x6, #64*4 + + movi v7.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.s[1] + scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_4s_x16_neon + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 + transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 +.macro store2 r0, r1, r2, r3, shift + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] + sqsub v4.4s, v0.4s, \r0 + sqadd v0.4s, v0.4s, \r0 + 
sqsub v5.4s, v1.4s, \r1 + sqadd v1.4s, v1.4s, \r1 + sqsub v6.4s, v2.4s, \r2 + sqadd v2.4s, v2.4s, \r2 + sqsub v7.4s, v3.4s, \r3 + sqadd v3.4s, v3.4s, \r3 + sqrshrn v0.4h, v0.4s, #\shift + sqrshrn2 v0.8h, v1.4s, #\shift + sqrshrn v1.4h, v2.4s, #\shift + sqrshrn2 v1.8h, v3.4s, #\shift + sqrshrn v2.4h, v7.4s, #\shift + sqrshrn2 v2.8h, v6.4s, #\shift + sqrshrn v3.4h, v5.4s, #\shift + sqrshrn2 v3.8h, v4.4s, #\shift + st1 {v0.8h, v1.8h}, [x6], #32 + rev64 v2.8h, v2.8h + rev64 v3.8h, v3.8h + st1 {v2.8h, v3.8h}, [x6], #32 +.endm + + store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift + store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift + store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift + store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift +.purgem store2 + br x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl X(inv_dct_8h_x16_neon) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl X(inv_dct32_odd_8h_x16_neon) + + neg x9, x8 + mov x10, x6 + movi v0.8h, #0 + mvni v1.8h, #0xfc, lsl #8 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8h}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8h}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8h}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + sqadd v5.8h, v5.8h, v2.8h + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + smax v2.8h, v5.8h, v0.8h + ld1 {v5.8h}, [x7], \stride + sqadd v6.8h, v6.8h, v3.8h + smin v2.8h, v2.8h, v1.8h + srshr v7.8h, 
v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8h}, [x6], x1 + ld1 {v2.8h}, [x10], x1 + smax v3.8h, v6.8h, v0.8h + sqadd v7.8h, v7.8h, v4.8h + smin v3.8h, v3.8h, v1.8h + srshr v5.8h, v5.8h, #4 + st1 {v3.8h}, [x6], x1 + smax v4.8h, v7.8h, v0.8h + sqadd v5.8h, v5.8h, v2.8h + smin v4.8h, v4.8h, v1.8h + st1 {v4.8h}, [x6], x1 + smax v2.8h, v5.8h, v0.8h + smin v2.8h, v2.8h, v1.8h + st1 {v2.8h}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + br x14 +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + .short 10, 43, 75, 107, 139, 171, 203, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + movi v0.8h, #0 + movi v1.8h, #0 + movrel x13, eob_32x32, 2 + + mov x8, #4*32 +1: + mov w9, #0 + movrel x12, eob_32x32, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + 
sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc + +.macro shift_16_regs op, shift +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movz w16, #2896*8, lsl #16 + movz w17, #2*(5793-4096)*8, lsl #16 + movi v0.4s, #0 + movi v1.4s, #0 + movrel x13, eob_16x32\hshort, 2 + + mov x8, #4*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + dup v2.2s, w16 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + mov v2.s[1], w17 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + +.if \w == 16 + // 16x32 + identity_4x16_shift1 v2.s[1] +.else + // 32x16 + shift_16_regs sqshl, 1 + identity_4x16 v2.s[1] +.endif + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, 
v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #16 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movi v0.4s, #0 + movi v1.4s, #0 + // Working on 8x8 blocks, read every other entry from eob_8x32 + movrel x13, eob_8x32, 2 + + mov w8, #4*\h +1: + // Working on 8x8 blocks, read every other entry from eob_8x32 + ldrh w12, [x13], #4 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + +.if \w == 8 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn2 v16.8h, v17.4s, #1 + sqrshrn v17.4h, v18.4s, #1 + sqrshrn2 v17.8h, v19.4s, #1 + sqrshrn v18.4h, v20.4s, #1 + sqrshrn2 v18.8h, v21.4s, #1 + sqrshrn v19.4h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + sqrshrn v20.4h, v24.4s, #1 + sqrshrn2 v20.8h, v25.4s, #1 + sqrshrn v21.4h, v26.4s, #1 + 
sqrshrn2 v21.8h, v27.4s, #1 + sqrshrn v22.4h, v28.4s, #1 + sqrshrn2 v22.8h, v29.4s, #1 + sqrshrn v23.4h, v30.4s, #1 + sqrshrn2 v23.8h, v31.4s, #1 +.else + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #4*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_4s_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*4) +.if \i > 0 + 
mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + movrel x13, eob_16x32 + movrel x5, X(inv_dct_8h_x16_neon) + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + mov x8, #4*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.4s, #0 + mov x8, #4*32 + mov w9, #32 + mov x6, sp + mov x7, x2 +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().4s}, [x7] + st1 {v28.4s}, [x7], x8 +.endr + ldrh w12, [x13], #2 + sub w9, w9, #4 + sub x7, x7, x8, lsl #3 + add x7, x7, #4*4 + + bl inv_dct_4s_x8_neon + + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + sqrshrn2 v16.8h, v20.4s, #2 + sqrshrn2 v17.8h, v21.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 + sqrshrn2 v19.8h, v23.4s, #2 + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + cmp w3, w12 + st1 
{v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #4 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + +.irp i, 0, 4 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + cmp w3, #10 + b.lt 1f +.endif + mov x8, #8*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + +2: + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9, lsl #1 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + br x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.4s, v1.4s}, [x17], #32 + + sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a + sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a + sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a + sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a + sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a + sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a + sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a + sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a + + ld1 {v0.4s}, [x17], #16 + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t33 + sqsub v26.4s, v19.4s, v18.4s // t34 + sqadd v27.4s, v19.4s, v18.4s // t35 + sqadd v28.4s, v20.4s, v21.4s // t60 + sqsub v29.4s, v20.4s, v21.4s // t61 + sqsub v30.4s, v23.4s, v22.4s // t62 + sqadd v31.4s, v23.4s, 
v22.4s // t63 + + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a + mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + neg v2.4s, v2.4s // t34a + mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a + srshr v26.4s, v2.4s, #12 // t34a + mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a + srshr v29.4s, v4.4s, #12 // t61a + srshr v25.4s, v6.4s, #12 // t33a + srshr v30.4s, v2.4s, #12 // t62a + + sqadd v16.4s, v24.4s, v27.4s // t32a + sqsub v19.4s, v24.4s, v27.4s // t35a + sqadd v17.4s, v25.4s, v26.4s // t33 + sqsub v18.4s, v25.4s, v26.4s // t34 + sqsub v20.4s, v31.4s, v28.4s // t60a + sqadd v23.4s, v31.4s, v28.4s // t63a + sqsub v21.4s, v30.4s, v29.4s // t61 + sqadd v22.4s, v30.4s, v29.4s // t62 + + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a + mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 + srshr v21.4s, v2.4s, #12 // t61a + srshr v18.4s, v4.4s, #12 // t34a + mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 + srshr v20.4s, v6.4s, #12 // t60 + srshr v19.4s, v2.4s, #12 // t35 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #4*4*0] // t32a + ldr q17, [x9, #4*4*8] // t39a + ldr q18, [x9, #4*4*0] // t63a + ldr q19, [x6, #4*4*8] // t56a + ldr q20, [x6, #4*4*16] // t40a + ldr q21, [x9, #4*4*24] // t47a + ldr q22, [x9, #4*4*16] // t55a + ldr q23, [x6, #4*4*24] // t48a + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t39 + sqadd v26.4s, v18.4s, v19.4s // t63 + sqsub v27.4s, v18.4s, v19.4s // t56 + sqsub v28.4s, v21.4s, v20.4s // t40 + sqadd v29.4s, v21.4s, v20.4s // t47 + sqadd v30.4s, v23.4s, v22.4s // t48 + sqsub v31.4s, v23.4s, v22.4s // t55 + + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // 
-> t56a + mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a + srshr v25.4s, v2.4s, #12 // t56a + srshr v27.4s, v4.4s, #12 // t39a + neg v6.4s, v6.4s // t40a + mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a + srshr v31.4s, v6.4s, #12 // t40a + srshr v28.4s, v2.4s, #12 // t55a + + sqadd v16.4s, v24.4s, v29.4s // t32a + sqsub v19.4s, v24.4s, v29.4s // t47a + sqadd v17.4s, v27.4s, v31.4s // t39 + sqsub v18.4s, v27.4s, v31.4s // t40 + sqsub v20.4s, v26.4s, v30.4s // t48a + sqadd v23.4s, v26.4s, v30.4s // t63a + sqsub v21.4s, v25.4s, v28.4s // t55 + sqadd v22.4s, v25.4s, v28.4s // t56 + + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a + mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 + srshr v18.4s, v2.4s, #12 // t40a + srshr v21.4s, v4.4s, #12 // t55a + mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 + srshr v19.4s, v6.4s, #12 // t47 + srshr v20.4s, v2.4s, #12 // t48 + + str q16, [x6, #4*4*0] // t32a + str q17, [x9, #4*4*0] // t39 + str q18, [x6, #4*4*8] // t40a + str q19, [x9, #4*4*8] // t47 + str q20, [x6, #4*4*16] // t48 + str q21, [x9, #4*4*16] // t55a + str q22, [x6, #4*4*24] // t56 + str q23, [x9, #4*4*24] // t63a + + add x6, x6, #4*4 + sub x9, x9, #4*4 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movz16dup_if reg, gpr, val, cond +.if \cond 
+ movz \gpr, \val, lsl #16 + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4s_x64_neon + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_4s_x16_neon + + store16 x6 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_4s_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.4s}, [x6], #16 + ld1 {v3.4s}, [x6], #16 + sqadd v6.4s, v2.4s, \r0 + sqsub \r0, v2.4s, \r0 + ld1 {v4.4s}, [x6], #16 + sqadd v7.4s, v3.4s, \r1 + sqsub \r1, v3.4s, \r1 + ld1 {v5.4s}, [x6], #16 + sqadd v2.4s, v4.4s, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.4s, \r2 + st1 {v6.4s}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.4s, v5.4s, \r3 + sqsub \r3, v5.4s, \r3 + st1 {v7.4s}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.4s}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.4s}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.4s, v30.4s, v29.4s, v28.4s + store_addsub v27.4s, v26.4s, v25.4s, v24.4s + store_addsub v23.4s, v22.4s, v21.4s, v20.4s + store_addsub v19.4s, v18.4s, v17.4s, v16.4s +.purgem store_addsub + + add x6, x6, 
#4*4*16 + + movrel x17, idct64_coeffs + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.4s}, [x7] // in1 (offset 0) + ld1 {v17.4s}, [x9] // in31 (offset 15) + ld1 {v18.4s}, [x10] // in17 (offset 8) + ld1 {v19.4s}, [x11] // in15 (offset 7) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.4s}, [x10] // in7 (offset 3) + ld1 {v17.4s}, [x11] // in25 (offset 12) + ld1 {v18.4s}, [x9] // in23 (offset 11) + ld1 {v19.4s}, [x7] // in9 (offset 4) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, 
x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #4*4*32 + add x9, x6, #4*4*7 + + bl inv_dct64_step2_neon + + br x14 +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x4_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #4*4*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-4*4*4 + + dup v7.4s, w12 +1: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.4s, \src0, \src1 + sqadd v0.4s, \src0, \src1 + sqsub v3.4s, \src2, \src3 + srshl v1.4s, v1.4s, v7.4s + sqadd v2.4s, \src2, \src3 + srshl v3.4s, v3.4s, v7.4s + srshl v0.4s, v0.4s, v7.4s + srshl v2.4s, v2.4s, v7.4s + sqxtn v3.4h, v3.4s + sqxtn2 v3.8h, v1.4s + sqxtn v0.4h, v0.4s + sqxtn2 v0.8h, v2.4s + rev64 v3.8h, v3.8h + st1 {v0.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.4s, v31.4s, v20.4s, v27.4s + store_addsub v17.4s, v30.4s, v21.4s, v26.4s + store_addsub v18.4s, v29.4s, v22.4s, v25.4s + store_addsub v19.4s, v28.4s, v23.4s, v24.4s +.purgem store_addsub + sub x6, x6, x10, lsl #2 + sub x9, x9, x10, lsl #2 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + br x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, 
v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8h}, [x6], x1 + ld1 {v1.8h}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8h}, [x6] + sqsub \src0, \src0, \src1 + ld1 {v3.8h}, [x9] + sqadd v5.8h, \src2, \src3 + sqsub \src2, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr \src0, \src0, #4 + sqadd v0.8h, v0.8h, v4.8h + srshr \src2, \src2, #4 + sqadd v1.8h, v1.8h, \src0 + sqadd v2.8h, v2.8h, v5.8h + smax v0.8h, v0.8h, v6.8h + sqadd v3.8h, v3.8h, \src2 + smax v1.8h, v1.8h, v6.8h + smin v0.8h, v0.8h, v7.8h + smax v2.8h, v2.8h, v6.8h + smin v1.8h, v1.8h, v7.8h + st1 {v0.8h}, [x6], x1 + smax v3.8h, v3.8h, v6.8h + smin v2.8h, v2.8h, v7.8h + st1 {v1.8h}, [x9], x10 + smin v3.8h, v3.8h, v7.8h + st1 {v2.8h}, [x6], x1 + st1 {v3.8h}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + br x14 +endfunc + +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, 
#(\i*2) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*4*4 + add x4, sp, #64*4*4 + + movrel x13, eob_16x32 + +.irp i, 0, 4, 8, 12 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + movrel x5, X(inv_dct_8h_x16_neon) +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, 
eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_4s_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + br x15 +endfunc diff --git a/third_party/dav1d/src/arm/64/loopfilter.S b/third_party/dav1d/src/arm/64/loopfilter.S new file mode 100644 index 0000000000..d45f2085a3 --- /dev/null +++ b/third_party/dav1d/src/arm/64/loopfilter.S @@ -0,0 +1,1123 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_16_wd\wd\()_neon + uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) + uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0) + uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0) + uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1) +.if \wd >= 6 + uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1) + uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1) +.endif +.if \wd >= 8 + uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2) + uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2) +.endif +.if \wd >= 6 + umax v4.16b, v4.16b, v5.16b +.endif + uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2 +.if \wd >= 8 + umax v6.16b, v6.16b, v7.16b +.endif + ushr v3.16b, v3.16b, #1 +.if \wd >= 8 + umax v4.16b, v4.16b, v6.16b +.endif +.if \wd >= 6 + and v4.16b, v4.16b, v14.16b +.endif + umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0)) + uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + umax v4.16b, v0.16b, v4.16b + cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) 
<= I +.else + cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + and v1.16b, v1.16b, v2.16b // fm + and v1.16b, v1.16b, v13.16b // fm && wd >= 4 +.if \wd >= 6 + and v14.16b, v14.16b, v1.16b // fm && wd > 4 +.endif +.if \wd >= 16 + and v15.16b, v15.16b, v1.16b // fm && wd == 16 +.endif + + mov x16, v1.d[0] + mov x17, v1.d[1] + adds x16, x16, x17 + b.eq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + movi v10.16b, #1 + uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) + uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0) + uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0) + uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0) +.if \wd >= 8 + uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0) + uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0) +.endif + umax v2.16b, v2.16b, v3.16b + umax v4.16b, v4.16b, v5.16b +.if \wd >= 8 + umax v6.16b, v6.16b, v7.16b +.endif + umax v2.16b, v2.16b, v4.16b +.if \wd >= 8 + umax v2.16b, v2.16b, v6.16b +.endif + +.if \wd == 16 + uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0) + uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0) + uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0) +.endif + cmhs v2.16b, v10.16b, v2.16b // flat8in +.if \wd == 16 + uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0) + uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0) + uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0) +.endif + and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 + bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in +.if \wd == 16 + umax v3.16b, v3.16b, v4.16b + umax v5.16b, v5.16b, v6.16b +.endif + mov x16, v1.d[0] + mov x17, v1.d[1] +.if \wd == 16 + umax v7.16b, v7.16b, v8.16b + umax v3.16b, v3.16b, v5.16b + umax v3.16b, v3.16b, v7.16b + cmhs v3.16b, v10.16b, v3.16b // flat8out +.endif + adds x16, x16, x17 +.if \wd == 16 + and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 + and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 + bic v14.16b, v14.16b, 
v15.16b // flat8in && fm && wd >= 4 && !flat8out +.endif + b.eq 1f // skip wd == 4 case +.endif + + usubl v2.8h, v22.8b, v25.8b // p1 - q1 + usubl2 v3.8h, v22.16b, v25.16b + cmhi v0.16b, v0.16b, v12.16b // hev + sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1) + sqxtn2 v2.16b, v3.8h + and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) + bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) + usubl v2.8h, v24.8b, v23.8b + movi v5.8h, #3 + usubl2 v3.8h, v24.16b, v23.16b + mul v2.8h, v2.8h, v5.8h + mul v3.8h, v3.8h, v5.8h + movi v6.16b, #4 + saddw v2.8h, v2.8h, v4.8b + saddw2 v3.8h, v3.8h, v4.16b + movi v7.16b, #3 + sqxtn v2.8b, v2.8h // f + sqxtn2 v2.16b, v3.8h + sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127) + sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) + sshr v4.16b, v4.16b, #3 // f1 + sshr v5.16b, v5.16b, #3 // f2 + uxtl v2.8h, v23.8b // p0 + uxtl2 v3.8h, v23.16b + uxtl v6.8h, v24.8b // q0 + uxtl2 v7.8h, v24.16b + saddw v2.8h, v2.8h, v5.8b + saddw2 v3.8h, v3.8h, v5.16b + ssubw v6.8h, v6.8h, v4.8b + ssubw2 v7.8h, v7.8h, v4.16b + srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 + sqxtun v2.8b, v2.8h // out p0 + sqxtun2 v2.16b, v3.8h + sqxtun v6.8b, v6.8h // out q0 + sqxtun2 v6.16b, v7.8h + bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) + uxtl v2.8h, v22.8b // p1 + uxtl2 v3.8h, v22.16b + bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) + uxtl v6.8h, v25.8b // q1 + uxtl2 v7.8h, v25.16b + saddw v2.8h, v2.8h, v4.8b + saddw2 v3.8h, v3.8h, v4.16b + ssubw v6.8h, v6.8h, v4.8b + ssubw2 v7.8h, v7.8h, v4.16b + sqxtun v2.8b, v2.8h // out p1 + sqxtun2 v2.16b, v3.8h + sqxtun v6.8b, v6.8h // out q1 + sqxtun2 v6.16b, v7.8h + bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 2f // skip if there's no flat8in + + uaddl v0.8h, v21.8b, v21.8b // p2 * 2 + uaddl2 v1.8h, v21.16b, v21.16b + uaddl v2.8h, v21.8b, v22.8b // p2 + p1 + 
uaddl2 v3.8h, v21.16b, v22.16b + uaddl v4.8h, v22.8b, v23.8b // p1 + p0 + uaddl2 v5.8h, v22.16b, v23.16b + uaddl v6.8h, v23.8b, v24.8b // p0 + q0 + uaddl2 v7.8h, v23.16b, v24.16b + add v8.8h, v0.8h, v2.8h + add v9.8h, v1.8h, v3.8h + add v10.8h, v4.8h, v6.8h + add v11.8h, v5.8h, v7.8h + uaddl v12.8h, v24.8b, v25.8b // q0 + q1 + uaddl2 v13.8h, v24.16b, v25.16b + add v8.8h, v8.8h, v10.8h + add v9.8h, v9.8h, v11.8h + sub v12.8h, v12.8h, v0.8h + sub v13.8h, v13.8h, v1.8h + uaddl v10.8h, v25.8b, v26.8b // q1 + q2 + uaddl2 v11.8h, v25.16b, v26.16b + rshrn v0.8b, v8.8h, #3 // out p1 + rshrn2 v0.16b, v9.8h, #3 + + add v8.8h, v8.8h, v12.8h + add v9.8h, v9.8h, v13.8h + sub v10.8h, v10.8h, v2.8h + sub v11.8h, v11.8h, v3.8h + uaddl v12.8h, v26.8b, v26.8b // q2 + q2 + uaddl2 v13.8h, v26.16b, v26.16b + rshrn v1.8b, v8.8h, #3 // out p0 + rshrn2 v1.16b, v9.8h, #3 + + add v8.8h, v8.8h, v10.8h + add v9.8h, v9.8h, v11.8h + sub v12.8h, v12.8h, v4.8h + sub v13.8h, v13.8h, v5.8h + rshrn v2.8b, v8.8h, #3 // out q0 + rshrn2 v2.16b, v9.8h, #3 + + bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) + add v8.8h, v8.8h, v12.8h + add v9.8h, v9.8h, v13.8h + bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) + rshrn v3.8b, v8.8h, #3 // out q1 + rshrn2 v3.16b, v9.8h, #3 + bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) + bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) +.elseif \wd >= 8 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 +.if \wd == 8 + b.eq 8f // skip if there's no flat8in +.else + b.eq 2f // skip if there's no flat8in +.endif + + uaddl v0.8h, v20.8b, v21.8b // p3 + p2 + uaddl2 v1.8h, v20.16b, v21.16b + uaddl v2.8h, v22.8b, v25.8b // p1 + q1 + uaddl2 v3.8h, v22.16b, v25.16b + uaddl v4.8h, v20.8b, v22.8b // p3 + p1 + uaddl2 v5.8h, v20.16b, v22.16b + uaddl v6.8h, v23.8b, v26.8b // p0 + q2 + uaddl2 v7.8h, v23.16b, v26.16b + add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) + add v9.8h, v1.8h, v1.8h + uaddw v8.8h, v8.8h, v23.8b // + p0 + uaddw2 v9.8h, v9.8h, v23.16b + uaddw v8.8h, v8.8h, 
v24.8b // + q0 + uaddw2 v9.8h, v9.8h, v24.16b + add v8.8h, v8.8h, v4.8h + add v9.8h, v9.8h, v5.8h // + p3 + p1 + sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 + sub v3.8h, v3.8h, v1.8h + sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 + sub v7.8h, v7.8h, v5.8h + rshrn v10.8b, v8.8h, #3 // out p2 + rshrn2 v10.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h, v9.8h, v3.8h + uaddl v0.8h, v20.8b, v23.8b // p3 + p0 + uaddl2 v1.8h, v20.16b, v23.16b + uaddl v2.8h, v24.8b, v27.8b // q0 + q3 + uaddl2 v3.8h, v24.16b, v27.16b + rshrn v11.8b, v8.8h, #3 // out p1 + rshrn2 v11.16b, v9.8h, #3 + + add v8.8h, v8.8h, v6.8h + add v9.8h, v9.8h, v7.8h + sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 + sub v3.8h, v3.8h, v1.8h + uaddl v4.8h, v21.8b, v24.8b // p2 + q0 + uaddl2 v5.8h, v21.16b, v24.16b + uaddl v6.8h, v25.8b, v27.8b // q1 + q3 + uaddl2 v7.8h, v25.16b, v27.16b + rshrn v12.8b, v8.8h, #3 // out p0 + rshrn2 v12.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h, v9.8h, v3.8h + sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 + sub v7.8h, v7.8h, v5.8h + uaddl v0.8h, v22.8b, v25.8b // p1 + q1 + uaddl2 v1.8h, v22.16b, v25.16b + uaddl v2.8h, v26.8b, v27.8b // q2 + q3 + uaddl2 v3.8h, v26.16b, v27.16b + rshrn v13.8b, v8.8h, #3 // out q0 + rshrn2 v13.16b, v9.8h, #3 + + add v8.8h, v8.8h, v6.8h + add v9.8h, v9.8h, v7.8h + sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 + sub v3.8h, v3.8h, v1.8h + rshrn v0.8b, v8.8h, #3 // out q1 + rshrn2 v0.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h , v9.8h, v3.8h + + bit v21.16b, v10.16b, v14.16b + bit v22.16b, v11.16b, v14.16b + bit v23.16b, v12.16b, v14.16b + rshrn v1.8b, v8.8h, #3 // out q2 + rshrn2 v1.16b, v9.8h, #3 + bit v24.16b, v13.16b, v14.16b + bit v25.16b, v0.16b, v14.16b + bit v26.16b, v1.16b, v14.16b +.endif +2: +.if \wd == 16 + mov x16, v15.d[0] + mov x17, v15.d[1] + adds x16, x16, x17 + b.ne 1f // check if flat8out is needed + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 8f // if there was no flat8in, just 
write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + uaddl v2.8h, v17.8b, v17.8b // p6 + p6 + uaddl2 v3.8h, v17.16b, v17.16b + uaddl v4.8h, v17.8b, v18.8b // p6 + p5 + uaddl2 v5.8h, v17.16b, v18.16b + uaddl v6.8h, v17.8b, v19.8b // p6 + p4 + uaddl2 v7.8h, v17.16b, v19.16b + uaddl v8.8h, v17.8b, v20.8b // p6 + p3 + uaddl2 v9.8h, v17.16b, v20.16b + add v12.8h, v2.8h, v4.8h + add v13.8h, v3.8h, v5.8h + add v10.8h, v6.8h, v8.8h + add v11.8h, v7.8h, v9.8h + uaddl v6.8h, v17.8b, v21.8b // p6 + p2 + uaddl2 v7.8h, v17.16b, v21.16b + add v12.8h, v12.8h, v10.8h + add v13.8h, v13.8h, v11.8h + uaddl v8.8h, v17.8b, v22.8b // p6 + p1 + uaddl2 v9.8h, v17.16b, v22.16b + uaddl v10.8h, v18.8b, v23.8b // p5 + p0 + uaddl2 v11.8h, v18.16b, v23.16b + add v6.8h, v6.8h, v8.8h + add v7.8h, v7.8h, v9.8h + uaddl v8.8h, v19.8b, v24.8b // p4 + q0 + uaddl2 v9.8h, v19.16b, v24.16b + add v12.8h, v12.8h, v6.8h + add v13.8h, v13.8h, v7.8h + add v10.8h, v10.8h, v8.8h + add v11.8h, v11.8h, v9.8h + uaddl v6.8h, v20.8b, v25.8b // p3 + q1 + uaddl2 v7.8h, v20.16b, v25.16b + add v12.8h, v12.8h, v10.8h + add v13.8h, v13.8h, v11.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + uaddl v2.8h, v21.8b, v26.8b // p2 + q2 + uaddl2 v3.8h, v21.16b, v26.16b + rshrn v0.8b, v12.8h, #4 // out p5 + rshrn2 v0.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) + add v13.8h, v13.8h, v7.8h + sub v2.8h, v2.8h, v4.8h + sub v3.8h, v3.8h, v5.8h + uaddl v4.8h, v22.8b, v27.8b // p1 + q3 + uaddl2 v5.8h, v22.16b, v27.16b + uaddl v6.8h, v17.8b, v19.8b // p6 + p4 + uaddl2 v7.8h, v17.16b, v19.16b + rshrn v1.8b, v12.8h, #4 // out p4 + rshrn2 v1.16b, v13.8h, #4 + add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) + add v13.8h, v13.8h, v3.8h + sub v4.8h, v4.8h, v6.8h + sub v5.8h, v5.8h, v7.8h + uaddl v6.8h, v23.8b, v28.8b // p0 + q4 + uaddl2 v7.8h, v23.16b, v28.16b + uaddl v8.8h, v17.8b, v20.8b // p6 + p3 + uaddl2 v9.8h, v17.16b, v20.16b + rshrn v2.8b, v12.8h, #4 // out p3 
+ rshrn2 v2.16b, v13.8h, #4 + add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) + add v13.8h, v13.8h, v5.8h + sub v6.8h, v6.8h, v8.8h + sub v7.8h, v7.8h, v9.8h + uaddl v8.8h, v24.8b, v29.8b // q0 + q5 + uaddl2 v9.8h, v24.16b, v29.16b + uaddl v4.8h, v17.8b, v21.8b // p6 + p2 + uaddl2 v5.8h, v17.16b, v21.16b + rshrn v3.8b, v12.8h, #4 // out p2 + rshrn2 v3.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) + add v13.8h, v13.8h, v7.8h + sub v8.8h, v8.8h, v4.8h + sub v9.8h, v9.8h, v5.8h + uaddl v6.8h, v25.8b, v30.8b // q1 + q6 + uaddl2 v7.8h, v25.16b, v30.16b + uaddl v10.8h, v17.8b, v22.8b // p6 + p1 + uaddl2 v11.8h, v17.16b, v22.16b + rshrn v4.8b, v12.8h, #4 // out p1 + rshrn2 v4.16b, v13.8h, #4 + add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) + add v13.8h, v13.8h, v9.8h + sub v6.8h, v6.8h, v10.8h + sub v7.8h, v7.8h, v11.8h + uaddl v8.8h, v26.8b, v30.8b // q2 + q6 + uaddl2 v9.8h, v26.16b, v30.16b + bif v0.16b, v18.16b, v15.16b // out p5 + uaddl v10.8h, v18.8b, v23.8b // p5 + p0 + uaddl2 v11.8h, v18.16b, v23.16b + rshrn v5.8b, v12.8h, #4 // out p0 + rshrn2 v5.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) + add v13.8h, v13.8h, v7.8h + sub v8.8h, v8.8h, v10.8h + sub v9.8h, v9.8h, v11.8h + uaddl v10.8h, v27.8b, v30.8b // q3 + q6 + uaddl2 v11.8h, v27.16b, v30.16b + bif v1.16b, v19.16b, v15.16b // out p4 + uaddl v18.8h, v19.8b, v24.8b // p4 + q0 + uaddl2 v19.8h, v19.16b, v24.16b + rshrn v6.8b, v12.8h, #4 // out q0 + rshrn2 v6.16b, v13.8h, #4 + add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) + add v13.8h, v13.8h, v9.8h + sub v10.8h, v10.8h, v18.8h + sub v11.8h, v11.8h, v19.8h + uaddl v8.8h, v28.8b, v30.8b // q4 + q6 + uaddl2 v9.8h, v28.16b, v30.16b + bif v2.16b, v20.16b, v15.16b // out p3 + uaddl v18.8h, v20.8b, v25.8b // p3 + q1 + uaddl2 v19.8h, v20.16b, v25.16b + rshrn v7.8b, v12.8h, #4 // out q1 + rshrn2 v7.16b, v13.8h, #4 + add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) + add v13.8h, v13.8h, v11.8h + sub 
v18.8h, v8.8h, v18.8h + sub v19.8h, v9.8h, v19.8h + uaddl v10.8h, v29.8b, v30.8b // q5 + q6 + uaddl2 v11.8h, v29.16b, v30.16b + bif v3.16b, v21.16b, v15.16b // out p2 + uaddl v20.8h, v21.8b, v26.8b // p2 + q2 + uaddl2 v21.8h, v21.16b, v26.16b + rshrn v8.8b, v12.8h, #4 // out q2 + rshrn2 v8.16b, v13.8h, #4 + add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) + add v13.8h, v13.8h, v19.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v21.8h + uaddl v18.8h, v30.8b, v30.8b // q6 + q6 + uaddl2 v19.8h, v30.16b, v30.16b + bif v4.16b, v22.16b, v15.16b // out p1 + uaddl v20.8h, v22.8b, v27.8b // p1 + q3 + uaddl2 v21.8h, v22.16b, v27.16b + rshrn v9.8b, v12.8h, #4 // out q3 + rshrn2 v9.16b, v13.8h, #4 + add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) + add v13.8h, v13.8h, v11.8h + sub v18.8h, v18.8h, v20.8h + sub v19.8h, v19.8h, v21.8h + bif v5.16b, v23.16b, v15.16b // out p0 + rshrn v10.8b, v12.8h, #4 // out q4 + rshrn2 v10.16b, v13.8h, #4 + add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) + add v13.8h, v13.8h, v19.8h + rshrn v11.8b, v12.8h, #4 // out q5 + rshrn2 v11.16b, v13.8h, #4 + bif v6.16b, v24.16b, v15.16b // out q0 + bif v7.16b, v25.16b, v15.16b // out q1 + bif v8.16b, v26.16b, v15.16b // out q2 + bif v9.16b, v27.16b, v15.16b // out q3 + bif v10.16b, v28.16b, v15.16b // out q4 + bif v11.16b, v29.16b, v15.16b // out q5 +.endif + + ret +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + br x13 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + br x14 +.endif +9: + // Return directly without writing back any pixels + br x15 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_16_wd16 + adr x13, 7f + adr x14, 8f + bl lpf_16_wd16_neon +.endm + +.macro lpf_16_wd8 + adr x14, 8f + bl lpf_16_wd8_neon +.endm + +.macro lpf_16_wd6 + bl lpf_16_wd6_neon +.endm + +.macro lpf_16_wd4 + bl lpf_16_wd4_neon +.endm + +function lpf_v_4_16_neon + mov 
x15, x30 + sub x16, x0, x1, lsl #1 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + + lpf_16_wd4 + + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_4_16_neon + mov x15, x30 + sub x16, x0, #2 + add x0, x16, x1, lsl #3 + ld1 {v22.s}[0], [x16], x1 + ld1 {v22.s}[2], [x0], x1 + ld1 {v23.s}[0], [x16], x1 + ld1 {v23.s}[2], [x0], x1 + ld1 {v24.s}[0], [x16], x1 + ld1 {v24.s}[2], [x0], x1 + ld1 {v25.s}[0], [x16], x1 + ld1 {v25.s}[2], [x0], x1 + ld1 {v22.s}[1], [x16], x1 + ld1 {v22.s}[3], [x0], x1 + ld1 {v23.s}[1], [x16], x1 + ld1 {v23.s}[3], [x0], x1 + ld1 {v24.s}[1], [x16], x1 + ld1 {v24.s}[3], [x0], x1 + ld1 {v25.s}[1], [x16], x1 + ld1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd4 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + br x15 +endfunc + +function lpf_v_6_16_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + lpf_16_wd6 + + sub x16, x0, x1, lsl #1 
+ st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_6_16_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #3 + ld1 {v20.d}[0], [x16], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.d}[0], [x16], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v27.d}[0], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd6 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + br x15 +endfunc + +function lpf_v_8_16_neon + mov x15, x30 + sub x16, x0, x1, lsl #2 + ld1 {v20.16b}, [x16], x1 // p3 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v27.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + + lpf_16_wd8 + + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + st1 {v21.16b}, [x16], x1 // p2 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v25.16b}, [x0], x1 // q1 + st1 {v23.16b}, [x16], x1 // p0 + st1 
{v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_8_16_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #3 + ld1 {v20.d}[0], [x16], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.d}[0], [x16], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v27.d}[0], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd8 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #4 + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v20.d}[0], [x16], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.d}[0], [x16], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.d}[0], [x16], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.d}[0], [x16], x1 + st1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +8: + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], 
[x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + br x15 +endfunc + +function lpf_v_16_16_neon + mov x15, x30 + + sub x16, x0, x1, lsl #3 + add x16, x16, x1 + ld1 {v17.16b}, [x16], x1 // p6 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v18.16b}, [x16], x1 // p5 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v19.16b}, [x16], x1 // p4 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v20.16b}, [x16], x1 // p3 + ld1 {v27.16b}, [x0], x1 // q3 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v28.16b}, [x0], x1 // q4 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v29.16b}, [x0], x1 // q5 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v30.16b}, [x0], x1 // q6 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + lpf_16_wd16 + + sub x16, x0, x1, lsl #2 + sub x16, x16, x1, lsl #1 + st1 {v0.16b}, [x16], x1 // p5 + st1 {v6.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x16], x1 // p4 + st1 {v7.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x16], x1 // p3 + st1 {v8.16b}, [x0], x1 // q2 + st1 {v3.16b}, [x16], x1 // p2 + st1 {v9.16b}, [x0], x1 // q3 + st1 {v4.16b}, [x16], x1 // p1 + st1 {v10.16b}, [x0], x1 // q4 + st1 {v5.16b}, [x16], x1 // p0 + st1 {v11.16b}, [x0], x1 // q5 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + br x15 +7: + sub x16, x0, x1 + sub x16, x16, x1, lsl #1 + st1 {v21.16b}, [x16], x1 // p2 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v25.16b}, [x0], x1 // q1 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_16_16_neon + mov x15, x30 + sub x16, x0, #8 + ld1 {v16.d}[0], [x16], x1 + ld1 {v24.d}[0], [x0], x1 + ld1 {v17.d}[0], [x16], x1 + ld1 {v25.d}[0], [x0], x1 + ld1 {v18.d}[0], [x16], x1 + ld1 {v26.d}[0], [x0], x1 + ld1 {v19.d}[0], [x16], x1 + ld1 {v27.d}[0], 
[x0], x1 + ld1 {v20.d}[0], [x16], x1 + ld1 {v28.d}[0], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v29.d}[0], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v30.d}[0], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v31.d}[0], [x0], x1 + ld1 {v16.d}[1], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v17.d}[1], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v18.d}[1], [x16], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v19.d}[1], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + ld1 {v20.d}[1], [x16], x1 + ld1 {v28.d}[1], [x0], x1 + ld1 {v21.d}[1], [x16], x1 + ld1 {v29.d}[1], [x0], x1 + ld1 {v22.d}[1], [x16], x1 + ld1 {v30.d}[1], [x0], x1 + ld1 {v23.d}[1], [x16], x1 + ld1 {v31.d}[1], [x0], x1 + + transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lpf_16_wd16 + + sub x0, x0, x1, lsl #4 + sub x16, x0, #8 + + transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 + transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 + + st1 {v16.d}[0], [x16], x1 + st1 {v6.d}[0], [x0], x1 + st1 {v17.d}[0], [x16], x1 + st1 {v7.d}[0], [x0], x1 + st1 {v0.d}[0], [x16], x1 + st1 {v8.d}[0], [x0], x1 + st1 {v1.d}[0], [x16], x1 + st1 {v9.d}[0], [x0], x1 + st1 {v2.d}[0], [x16], x1 + st1 {v10.d}[0], [x0], x1 + st1 {v3.d}[0], [x16], x1 + st1 {v11.d}[0], [x0], x1 + st1 {v4.d}[0], [x16], x1 + st1 {v30.d}[0], [x0], x1 + st1 {v5.d}[0], [x16], x1 + st1 {v31.d}[0], [x0], x1 + st1 {v16.d}[1], [x16], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v17.d}[1], [x16], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x16], x1 + st1 {v8.d}[1], [x0], x1 + st1 {v1.d}[1], [x16], x1 + st1 {v9.d}[1], [x0], x1 + st1 {v2.d}[1], [x16], x1 + st1 {v10.d}[1], [x0], x1 + st1 {v3.d}[1], [x16], x1 + st1 {v11.d}[1], [x0], x1 + st1 {v4.d}[1], [x16], x1 + st1 {v30.d}[1], [x0], x1 + st1 {v5.d}[1], [x16], x1 + st1 {v31.d}[1], [x0], x1 + br x15 + +7: + sub x16, x0, x1, lsl #4 + sub x16, x16, #4 + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl 
#3 + + st1 {v20.d}[0], [x16], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.d}[0], [x16], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.d}[0], [x16], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.d}[0], [x16], x1 + st1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +8: + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + br x15 +endfunc + +// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 + mov x11, x30 + stp d8, d9, [sp, #-0x40]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp w6, w7, [x2] // vmask[0], vmask[1] +.ifc \type, y + ldr w2, [x2, #8] // vmask[2] +.endif + add x5, x5, #128 // Move to sharp part of lut +.ifc \type, y + orr w7, w7, w2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub x4, x3, x4, lsl #2 +.else + sub x3, x3, #4 + lsl x4, x4, #2 +.endif + orr w6, w6, w7 // vmask[0] |= vmask[1] + +1: + tst w6, #0x0f +.ifc \dir, v + ld1 {v0.16b}, [x4], #16 + ld1 {v1.16b}, [x3], #16 +.else + ld2 {v0.s,v1.s}[0], [x3], x4 + ld2 {v0.s,v1.s}[1], [x3], x4 + ld2 {v0.s,v1.s}[2], [x3], x4 + ld2 {v0.s,v1.s}[3], [x3], x4 +.endif + b.eq 7f // if (!(vm & bits)) continue; + + ld1r {v5.16b}, [x5] // sharp[0] + add x5, x5, #8 + movi v2.4s, #0xff + dup v13.4s, w6 // vmask[0] + + and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word + and v1.16b, v1.16b, v2.16b + cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0] + movi v4.16b, #1 + ld1r {v6.16b}, [x5] // sharp[1] + sub x5, x5, #8 + bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] + mul v1.4s, v1.4s, v4.4s // L +.ifc \type, y + dup v15.4s, w2 // vmask[2] +.endif + cmtst v2.4s, v1.4s, v2.4s // L != 0 + dup v14.4s, w7 // vmask[1] + mov x16, v2.d[0] + mov x17, v2.d[1] + adds x16, x16, x17 + b.eq 7f // if (!L) continue; + neg v5.16b, v5.16b // -sharp[0] + movrel x16, word_1248 + ushr v12.16b, v1.16b, #4 // H + ld1 {v16.4s}, [x16] + sshl v3.16b, v1.16b, v5.16b // L >> sharp[0] +.ifc \type, y + cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits) +.endif + movi v7.16b, #2 + umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1]) + add v0.16b, v1.16b, v7.16b // L + 2 + umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I + add v0.16b, v0.16b, v0.16b // 2*(L + 2) + cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits) + add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E + cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits) + and v13.16b, v13.16b, v2.16b // 
vmask[0] &= L != 0 + +.ifc \type, y + tst w2, #0x0f + b.eq 2f + // wd16 + bl lpf_\dir\()_16_16_neon + b 8f +2: +.endif + tst w7, #0x0f + b.eq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_16_neon +.else + // wd6 + bl lpf_\dir\()_6_16_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_16_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment x0. + // If the whole function is skipped, increment it here instead. + add x0, x0, x1, lsl #4 +.else +7: +.endif +8: + lsr w6, w6, #4 // vmask[0] >>= 4 + lsr w7, w7, #4 // vmask[1] >>= 4 +.ifc \type, y + lsr w2, w2, #4 // vmask[2] >>= 4 +.endif +.ifc \dir, v + add x0, x0, #16 +.else + // For dir h, x0 is returned incremented +.endif + cbnz w6, 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x11 +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_1248 + .word 1, 2, 4, 8 +endconst diff --git a/third_party/dav1d/src/arm/64/loopfilter16.S b/third_party/dav1d/src/arm/64/loopfilter16.S new file mode 100644 index 0000000000..a7319189d0 --- /dev/null +++ b/third_party/dav1d/src/arm/64/loopfilter16.S @@ -0,0 +1,907 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_8_wd\wd\()_neon + uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0) + uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1) +.if \wd >= 6 + uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1) + uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1) +.endif +.if \wd >= 8 + uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2) + uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2) +.endif +.if \wd >= 6 + umax v4.8h, v4.8h, v5.8h +.endif + uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2 +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + ushr v3.8h, v3.8h, #1 +.if \wd >= 8 + umax v4.8h, v4.8h, v6.8h +.endif +.if \wd >= 6 + and v4.16b, v4.16b, v14.16b +.endif + umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) + uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + umax v4.8h, v0.8h, v4.8h + cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) 
<= I +.else + cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + and v1.16b, v1.16b, v2.16b // fm + and v1.16b, v1.16b, v13.16b // fm && wd >= 4 +.if \wd >= 6 + and v14.16b, v14.16b, v1.16b // fm && wd > 4 +.endif +.if \wd >= 16 + and v15.16b, v15.16b, v1.16b // fm && wd == 16 +.endif + + mov x16, v1.d[0] + mov x17, v1.d[1] + adds x16, x16, x17 + b.eq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + movi v10.8h, #1 + uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) + uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0) + dup v9.8h, w9 // bitdepth_min_8 +.if \wd >= 8 + uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) + uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0) +.endif + umax v2.8h, v2.8h, v3.8h + umax v4.8h, v4.8h, v5.8h +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + umax v2.8h, v2.8h, v4.8h + ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8 +.if \wd >= 8 + umax v2.8h, v2.8h, v6.8h +.endif + +.if \wd == 16 + uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0) + uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0) + uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0) +.endif + cmhs v2.8h, v10.8h, v2.8h // flat8in +.if \wd == 16 + uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0) + uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0) + uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0) +.endif + and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 + bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in +.if \wd == 16 + umax v3.8h, v3.8h, v4.8h + umax v5.8h, v5.8h, v6.8h +.endif + mov x16, v1.d[0] + mov x17, v1.d[1] +.if \wd == 16 + umax v7.8h, v7.8h, v8.8h + umax v3.8h, v3.8h, v5.8h + umax v3.8h, v3.8h, v7.8h + cmhs v3.8h, v10.8h, v3.8h // flat8out +.endif + adds x16, x16, x17 +.if \wd == 16 + and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 + and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 + bic v14.16b, 
v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out +.endif + b.eq 1f // skip wd == 4 case +.endif + + dup v3.8h, w8 // bitdepth_max + sub v2.8h, v22.8h, v25.8h // p1 - q1 + ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1 + cmhi v0.8h, v0.8h, v12.8h // hev + not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8) + smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1) + smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1) + and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) + sub v2.8h, v24.8h, v23.8h + movi v5.8h, #3 + bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) + mul v2.8h, v2.8h, v5.8h + movi v6.8h, #4 + add v2.8h, v2.8h, v4.8h + smin v2.8h, v2.8h, v3.8h // f = iclip_diff() + movi v7.8h, #3 + smax v2.8h, v2.8h, v9.8h // f = iclip_diff() + sqadd v4.8h, v6.8h, v2.8h // f + 4 + sqadd v5.8h, v7.8h, v2.8h // f + 3 + smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) + smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) + sshr v4.8h, v4.8h, #3 // f1 + sshr v5.8h, v5.8h, #3 // f2 + movi v9.8h, #0 + dup v3.8h, w8 // bitdepth_max + sqadd v2.8h, v23.8h, v5.8h // p0 + f2 + sqsub v6.8h, v24.8h, v4.8h // q0 - f1 + srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1 + smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel() + bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) + bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) + sqadd v2.8h, v22.8h, v4.8h // p1 + f + sqsub v6.8h, v25.8h, v4.8h // q1 - f + smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel() + bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds 
x16, x16, x17 + b.eq 2f // skip if there's no flat8in + + add v0.8h, v21.8h, v21.8h // p2 * 2 + add v2.8h, v21.8h, v22.8h // p2 + p1 + add v4.8h, v22.8h, v23.8h // p1 + p0 + add v6.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v0.8h, v2.8h + add v10.8h, v4.8h, v6.8h + add v12.8h, v24.8h, v25.8h // q0 + q1 + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v0.8h + add v10.8h, v25.8h, v26.8h // q1 + q2 + urshr v0.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v12.8h + sub v10.8h, v10.8h, v2.8h + add v12.8h, v26.8h, v26.8h // q2 + q2 + urshr v1.8h, v8.8h, #3 // out p0 + + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v4.8h + urshr v2.8h, v8.8h, #3 // out q0 + + bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) + add v8.8h, v8.8h, v12.8h + bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) + urshr v3.8h, v8.8h, #3 // out q1 + bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) + bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) +.elseif \wd >= 8 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 +.if \wd == 8 + b.eq 8f // skip if there's no flat8in +.else + b.eq 2f // skip if there's no flat8in +.endif + + add v0.8h, v20.8h, v21.8h // p3 + p2 + add v2.8h, v22.8h, v25.8h // p1 + q1 + add v4.8h, v20.8h, v22.8h // p3 + p1 + add v6.8h, v23.8h, v26.8h // p0 + q2 + add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) + add v9.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v8.8h, v4.8h // + p3 + p1 + sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 + add v8.8h, v8.8h, v9.8h // + p0 + q0 + sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 + urshr v10.8h, v8.8h, #3 // out p2 + + add v8.8h, v8.8h, v2.8h + add v0.8h, v20.8h, v23.8h // p3 + p0 + add v2.8h, v24.8h, v27.8h // q0 + q3 + urshr v11.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 + add v4.8h, v21.8h, v24.8h // p2 + q0 + add v6.8h, v25.8h, v27.8h // q1 + q3 + urshr v12.8h, v8.8h, #3 // out p0 + + add v8.8h, v8.8h, v2.8h + sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 + add v0.8h, v22.8h, v25.8h // p1 + 
q1 + add v2.8h, v26.8h, v27.8h // q2 + q3 + urshr v13.8h, v8.8h, #3 // out q0 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 + urshr v0.8h, v8.8h, #3 // out q1 + + add v8.8h, v8.8h, v2.8h + + bit v21.16b, v10.16b, v14.16b + bit v22.16b, v11.16b, v14.16b + bit v23.16b, v12.16b, v14.16b + urshr v1.8h, v8.8h, #3 // out q2 + bit v24.16b, v13.16b, v14.16b + bit v25.16b, v0.16b, v14.16b + bit v26.16b, v1.16b, v14.16b +.endif +2: +.if \wd == 16 + mov x16, v15.d[0] + mov x17, v15.d[1] + adds x16, x16, x17 + b.ne 1f // check if flat8out is needed + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + add v2.8h, v17.8h, v17.8h // p6 + p6 + add v4.8h, v17.8h, v18.8h // p6 + p5 + add v6.8h, v17.8h, v19.8h // p6 + p4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + add v12.8h, v2.8h, v4.8h + add v10.8h, v6.8h, v8.8h + add v6.8h, v17.8h, v21.8h // p6 + p2 + add v12.8h, v12.8h, v10.8h + add v8.8h, v17.8h, v22.8h // p6 + p1 + add v10.8h, v18.8h, v23.8h // p5 + p0 + add v6.8h, v6.8h, v8.8h + add v8.8h, v19.8h, v24.8h // p4 + q0 + add v12.8h, v12.8h, v6.8h + add v10.8h, v10.8h, v8.8h + add v6.8h, v20.8h, v25.8h // p3 + q1 + add v12.8h, v12.8h, v10.8h + sub v6.8h, v6.8h, v2.8h + add v2.8h, v21.8h, v26.8h // p2 + q2 + urshr v0.8h, v12.8h, #4 // out p5 + add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) + sub v2.8h, v2.8h, v4.8h + add v4.8h, v22.8h, v27.8h // p1 + q3 + add v6.8h, v17.8h, v19.8h // p6 + p4 + urshr v1.8h, v12.8h, #4 // out p4 + add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) + sub v4.8h, v4.8h, v6.8h + add v6.8h, v23.8h, v28.8h // p0 + q4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + urshr v2.8h, v12.8h, #4 // out p3 + add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) + sub v6.8h, v6.8h, v8.8h + add v8.8h, v24.8h, v29.8h // q0 + q5 + add v4.8h, v17.8h, v21.8h // p6 + p2 + urshr v3.8h, v12.8h, #4 // out p2 + 
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) + sub v8.8h, v8.8h, v4.8h + add v6.8h, v25.8h, v30.8h // q1 + q6 + add v10.8h, v17.8h, v22.8h // p6 + p1 + urshr v4.8h, v12.8h, #4 // out p1 + add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) + sub v6.8h, v6.8h, v10.8h + add v8.8h, v26.8h, v30.8h // q2 + q6 + bif v0.16b, v18.16b, v15.16b // out p5 + add v10.8h, v18.8h, v23.8h // p5 + p0 + urshr v5.8h, v12.8h, #4 // out p0 + add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) + sub v8.8h, v8.8h, v10.8h + add v10.8h, v27.8h, v30.8h // q3 + q6 + bif v1.16b, v19.16b, v15.16b // out p4 + add v18.8h, v19.8h, v24.8h // p4 + q0 + urshr v6.8h, v12.8h, #4 // out q0 + add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) + sub v10.8h, v10.8h, v18.8h + add v8.8h, v28.8h, v30.8h // q4 + q6 + bif v2.16b, v20.16b, v15.16b // out p3 + add v18.8h, v20.8h, v25.8h // p3 + q1 + urshr v7.8h, v12.8h, #4 // out q1 + add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) + sub v18.8h, v8.8h, v18.8h + add v10.8h, v29.8h, v30.8h // q5 + q6 + bif v3.16b, v21.16b, v15.16b // out p2 + add v20.8h, v21.8h, v26.8h // p2 + q2 + urshr v8.8h, v12.8h, #4 // out q2 + add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) + sub v10.8h, v10.8h, v20.8h + add v18.8h, v30.8h, v30.8h // q6 + q6 + bif v4.16b, v22.16b, v15.16b // out p1 + add v20.8h, v22.8h, v27.8h // p1 + q3 + urshr v9.8h, v12.8h, #4 // out q3 + add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) + sub v18.8h, v18.8h, v20.8h + bif v5.16b, v23.16b, v15.16b // out p0 + urshr v10.8h, v12.8h, #4 // out q4 + add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) + urshr v11.8h, v12.8h, #4 // out q5 + bif v6.16b, v24.16b, v15.16b // out q0 + bif v7.16b, v25.16b, v15.16b // out q1 + bif v8.16b, v26.16b, v15.16b // out q2 + bif v9.16b, v27.16b, v15.16b // out q3 + bif v10.16b, v28.16b, v15.16b // out q4 + bif v11.16b, v29.16b, v15.16b // out q5 +.endif + + ret +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels 
+ br x13 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + br x14 +.endif +9: + // Return directly without writing back any pixels + br x15 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_8_wd16 + adr x13, 7f + adr x14, 8f + bl lpf_8_wd16_neon +.endm + +.macro lpf_8_wd8 + adr x14, 8f + bl lpf_8_wd8_neon +.endm + +.macro lpf_8_wd6 + bl lpf_8_wd6_neon +.endm + +.macro lpf_8_wd4 + bl lpf_8_wd4_neon +.endm + +function lpf_v_4_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_4_8_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #2 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_6_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, 
x1, lsl #1 + sub x0, x0, x1 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_6_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_8_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_8_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + 
ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + br x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_16_8_neon + mov x15, x30 + + sub x16, x0, x1, lsl #3 + add x16, x16, x1 + ld1 {v17.8h}, [x16], x1 // p6 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v18.8h}, [x16], x1 // p5 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v19.8h}, [x16], x1 // p4 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v27.8h}, [x0], x1 // q3 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v28.8h}, [x0], x1 // q4 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v29.8h}, [x0], x1 // q5 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v30.8h}, [x0], x1 // q6 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + lpf_8_wd16 + + sub x16, x0, x1, lsl #2 + sub x16, x16, x1, lsl #1 + st1 {v0.8h}, [x16], x1 // p5 + st1 {v6.8h}, [x0], x1 // q0 + st1 {v1.8h}, [x16], x1 // p4 + st1 {v7.8h}, [x0], x1 // q1 + st1 {v2.8h}, [x16], x1 // p3 + st1 {v8.8h}, [x0], x1 // q2 + st1 {v3.8h}, [x16], x1 // p2 + st1 {v9.8h}, [x0], x1 // q3 + st1 {v4.8h}, [x16], x1 // p1 + st1 {v10.8h}, [x0], x1 // q4 + st1 {v5.8h}, [x16], x1 // p0 + st1 
{v11.8h}, [x0], x1 // q5 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + br x15 +7: + sub x16, x0, x1 + sub x16, x16, x1, lsl #1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_16_8_neon + mov x15, x30 + sub x16, x0, #16 + ld1 {v16.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v17.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v18.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v19.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + ld1 {v20.8h}, [x16], x1 + ld1 {v28.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v29.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v30.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v31.8h}, [x0], x1 + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lpf_8_wd16 + + sub x0, x0, x1, lsl #3 + sub x16, x0, #16 + + transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 + transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 + + st1 {v16.8h}, [x16], x1 + st1 {v6.8h}, [x0], x1 + st1 {v17.8h}, [x16], x1 + st1 {v7.8h}, [x0], x1 + st1 {v0.8h}, [x16], x1 + st1 {v8.8h}, [x0], x1 + st1 {v1.8h}, [x16], x1 + st1 {v9.8h}, [x0], x1 + st1 {v2.8h}, [x16], x1 + st1 {v10.8h}, [x0], x1 + st1 {v3.8h}, [x16], x1 + st1 {v11.8h}, [x0], x1 + st1 {v4.8h}, [x16], x1 + st1 {v30.8h}, [x0], x1 + st1 {v5.8h}, [x16], x1 + st1 {v31.8h}, [x0], x1 + br x15 + +7: + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 
{v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + br x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + mov x11, x30 + mov w8, w7 // bitdepth_max + clz w9, w8 + mov w10, #24 + sub w9, w10, w9 // bitdepth_min_8 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp w6, w7, [x2] // vmask[0], vmask[1] +.ifc \type, y + ldr w2, [x2, #8] // vmask[2] +.endif + add x5, x5, #128 // Move to sharp part of lut +.ifc \type, y + orr w7, w7, w2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub x4, x3, x4, lsl #2 +.else + sub x3, x3, #4 + lsl x4, x4, #2 +.endif + orr w6, w6, w7 // vmask[0] |= vmask[1] + +1: + tst w6, #0x0f +.ifc \dir, v + ld1 {v0.8b}, [x4], #8 + ld1 {v1.8b}, [x3], #8 +.else + ld2 {v0.s,v1.s}[0], [x3], x4 + ld2 {v0.s,v1.s}[1], [x3], x4 +.endif + b.eq 7f // if (!(vm & bits)) continue; + + ld1r {v5.8b}, [x5] // sharp[0] + add x5, x5, #8 + movi v2.2s, #0xff + dup v13.2s, w6 // vmask[0] + dup v31.8h, w9 // bitdepth_min_8 + + and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word + and v1.8b, v1.8b, v2.8b + cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0] + movi v4.8b, #1 + ld1r {v6.8b}, [x5] // sharp[1] + sub x5, x5, #8 + bif v1.8b, v0.8b, v3.8b 
// if (!l[0][0]) L = l[offset][0] + mul v1.2s, v1.2s, v4.2s // L +.ifc \type, y + dup v15.2s, w2 // vmask[2] +.endif + cmtst v2.2s, v1.2s, v2.2s // L != 0 + dup v14.2s, w7 // vmask[1] + mov x16, v2.d[0] + cmp x16, #0 + b.eq 7f // if (!L) continue; + neg v5.8b, v5.8b // -sharp[0] + movrel x16, word_12 + ushr v12.8b, v1.8b, #4 // H + ld1 {v16.2s}, [x16] + sshl v3.8b, v1.8b, v5.8b // L >> sharp[0] +.ifc \type, y + cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits) +.endif + movi v7.8b, #2 + umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1]) + add v0.8b, v1.8b, v7.8b // L + 2 + umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I + add v0.8b, v0.8b, v0.8b // 2*(L + 2) + cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits) + uxtl v12.8h, v12.8b + add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E + cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits) + uxtl v11.8h, v11.8b + uxtl v10.8h, v10.8b + and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0 + sxtl v14.8h, v14.8b + sxtl v13.8h, v13.8b +.ifc \type, y + sxtl v15.8h, v15.8b +.endif + ushl v12.8h, v12.8h, v31.8h + ushl v11.8h, v11.8h, v31.8h + ushl v10.8h, v10.8h, v31.8h + +.ifc \type, y + tst w2, #0x0f + b.eq 2f + // wd16 + bl lpf_\dir\()_16_8_neon + b 8f +2: +.endif + tst w7, #0x0f + b.eq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_8_neon +.else + // wd6 + bl lpf_\dir\()_6_8_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_8_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment x0. + // If the whole function is skipped, increment it here instead. 
+ add x0, x0, x1, lsl #3 +.else +7: +.endif +8: + lsr w6, w6, #2 // vmask[0] >>= 2 + lsr w7, w7, #2 // vmask[1] >>= 2 +.ifc \type, y + lsr w2, w2, #2 // vmask[2] >>= 2 +.endif +.ifc \dir, v + add x0, x0, #16 +.else + // For dir h, x0 is returned incremented +.endif + cbnz w6, 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x11 +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_12 + .word 1, 2 +endconst diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S new file mode 100644 index 0000000000..1e864c29ac --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration.S @@ -0,0 +1,1152 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges); +function wiener_filter_h_8bpc_neon, export=1 + mov w8, w5 + ld1 {v0.8h}, [x4] + mov w9, #(1 << 14) - (1 << 2) + dup v30.8h, w9 + movi v31.8h, #8, lsl #8 + // Calculate mid_stride + add w10, w5, #7 + bic w10, w10, #7 + lsl w10, w10, #1 + + // Clear the last unused element of v0, to allow filtering a single + // pixel with one plain mul+addv. + ins v0.h[7], wzr + + // Set up pointers for reading/writing alternate rows + add x12, x0, x10 + lsl w10, w10, #1 + add x13, x2, x3 + lsl x3, x3, #1 + + // Subtract the width from mid_stride + sub x10, x10, w5, uxtw #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. + cmp w5, #8 + add w11, w5, #13 + bic w11, w11, #7 + b.ge 1f + mov w11, #16 +1: + sub x3, x3, w11, uxtw + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x1, 0f + // left == NULL + sub x2, x2, #3 + sub x13, x13, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. 
+ add x3, x3, #3 + + +1: // Loop vertically + ld1 {v3.16b}, [x2], #16 + ld1 {v5.16b}, [x13], #16 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x1, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.s}[3], [x1], #4 + // Move x2/x13 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x2, x2, #3 + sub x13, x13, #3 + ld1 {v4.s}[3], [x1], #4 + ext v3.16b, v2.16b, v3.16b, #13 + ext v5.16b, v4.16b, v5.16b, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + dup v4.16b, v5.b[0] + // Move x2 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x2, x2, #3 + sub x13, x13, #3 + ext v3.16b, v2.16b, v3.16b, #13 + ext v5.16b, v4.16b, v5.16b, #13 + +2: + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w9, w5, #14 + ldr b28, [x2, w9, sxtw] + ldr b29, [x13, w9, sxtw] + // Fill v28/v29 with the right padding pixel + dup v28.8b, v28.b[0] + dup v29.8b, v29.b[0] + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro filter wd + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ ext v16.16b, v2.16b, v3.16b, #2 + ext v17.16b, v2.16b, v3.16b, #4 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + mul v6\wd, v2\wd, v0.h[0] + mla v6\wd, v16\wd, v0.h[1] + mla v6\wd, v17\wd, v0.h[2] + mla v6\wd, v18\wd, v0.h[3] + mla v6\wd, v19\wd, v0.h[4] + mla v6\wd, v20\wd, v0.h[5] + mla v6\wd, v21\wd, v0.h[6] + ext v22.16b, v4.16b, v5.16b, #2 + ext v23.16b, v4.16b, v5.16b, #4 + ext v24.16b, v4.16b, v5.16b, #6 + ext v25.16b, v4.16b, v5.16b, #8 + ext v26.16b, v4.16b, v5.16b, #10 + ext v27.16b, v4.16b, v5.16b, #12 + mul v7\wd, v4\wd, v0.h[0] + mla v7\wd, v22\wd, v0.h[1] + mla v7\wd, v23\wd, v0.h[2] + mla v7\wd, v24\wd, v0.h[3] + mla v7\wd, v25\wd, v0.h[4] + mla v7\wd, v26\wd, v0.h[5] + mla v7\wd, v27\wd, v0.h[6] + + shl v18\wd, v18\wd, #7 + shl v24\wd, v24\wd, #7 + sub v18\wd, v18\wd, v30\wd + sub v24\wd, v24\wd, v30\wd + sqadd v6\wd, v6\wd, v18\wd + sqadd v7\wd, v7\wd, v24\wd + sshr v6\wd, v6\wd, #3 + sshr v7\wd, v7\wd, #3 + add v6\wd, v6\wd, v31\wd + add v7\wd, v7\wd, v31\wd +.endm + filter .8h + st1 {v6.8h}, [x0], #16 + st1 {v7.8h}, [x12], #16 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v2.16b, v3.16b + mov v4.16b, v5.16b + ld1 {v3.8b}, [x2], #8 + ld1 {v5.8b}, [x13], #8 + uxtl v3.8h, v3.8b + uxtl v5.8h, v5.8b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Filter 4 pixels, 7 <= w < 11 + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + + subs w5, w5, #4 // 3 <= w < 7 + ext v2.16b, v2.16b, v3.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + +6: // Pad the right edge and filter the last few pixels. 
+ // w < 7, w+3 pixels valid in v2-v3 + cmp w5, #5 + b.lt 7f + b.gt 8f + // w == 5, 8 pixels valid in v2, v3 invalid + mov v3.16b, v28.16b + mov v5.16b, v29.16b + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in v2 + sub w9, w5, #1 + // w9 = (pixels valid - 4) + adr x11, L(variable_shift_tbl) + ldrh w9, [x11, w9, uxtw #1] + sub x11, x11, w9, uxth + mov v3.16b, v28.16b + mov v5.16b, v29.16b + br x11 +44: // 4 pixels valid in v2/v4, fill the high half with padding. + ins v2.d[1], v3.d[0] + ins v4.d[1], v5.d[0] + b 88f + // Shift v2 right, shifting out invalid pixels, + // shift v2 left to the original offset, shifting in padding pixels. +55: // 5 pixels valid + ext v2.16b, v2.16b, v2.16b, #10 + ext v2.16b, v2.16b, v3.16b, #6 + ext v4.16b, v4.16b, v4.16b, #10 + ext v4.16b, v4.16b, v5.16b, #6 + b 88f +66: // 6 pixels valid, fill the upper 2 pixels with padding. + ins v2.s[3], v3.s[0] + ins v4.s[3], v5.s[0] + b 88f +77: // 7 pixels valid, fill the last pixel with padding. + ins v2.h[7], v3.h[0] + ins v4.h[7], v5.h[0] + b 88f + +L(variable_shift_tbl): + .hword L(variable_shift_tbl) - 44b + .hword L(variable_shift_tbl) - 55b + .hword L(variable_shift_tbl) - 66b + .hword L(variable_shift_tbl) - 77b + +8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 + ins v28.h[0], v3.h[0] + ins v29.h[0], v5.h[0] + mov v3.16b, v28.16b + mov v5.16b, v29.16b + +88: + // w < 7, v2-v3 padded properly + cmp w5, #4 + b.lt 888f + + // w >= 4, filter 4 pixels + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + subs w5, w5, #4 // 0 <= w < 4 + ext v2.16b, v2.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + b.eq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + mul v6.8h, v2.8h, v0.8h + mul v7.8h, v4.8h, v0.8h + addv h6, v6.8h + addv h7, v7.8h + dup v16.4h, v2.h[3] + ins v16.h[1], v4.h[3] + ins v6.h[1], v7.h[0] + shl v16.4h, v16.4h, #7 + sub v16.4h, v16.4h, v30.4h + sqadd v6.4h, v6.4h, v16.4h + sshr v6.4h, v6.4h, #3 + add v6.4h, v6.4h, v31.4h + st1 {v6.h}[0], [x0], #2 + 
st1 {v6.h}[1], [x12], #2 + subs w5, w5, #1 + ext v2.16b, v2.16b, v3.16b, #2 + ext v4.16b, v4.16b, v5.16b, #2 + b.gt 888b + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x10 + add x12, x12, x10 + add x2, x2, x3 + add x13, x13, x3 + mov w5, w8 + b 1b +0: + ret +.purgem filter +endfunc + +// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride); +function wiener_filter_v_8bpc_neon, export=1 + mov w8, w4 + ld1 {v0.8h}, [x5] + movi v1.8h, #128 + add v1.8h, v1.8h, v0.8h + + // Calculate the number of rows to move back when looping vertically + mov w11, w4 + tst w6, #4 // LR_HAVE_TOP + b.eq 0f + sub x2, x2, x7, lsl #1 + add w11, w11, #2 +0: + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 1f + add w11, w11, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into v16-v19 and pad properly. + tst w6, #4 // LR_HAVE_TOP + ld1 {v16.8h}, [x2], x7 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.8h}, [x2], x7 + mov v17.16b, v16.16b + ld1 {v19.8h}, [x2], x7 + b 3f +2: // !LR_HAVE_TOP + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + +3: + cmp w4, #4 + b.lt 5f + // Start filtering normally; fill in v20-v22 with unique rows. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + ld1 {v22.8h}, [x2], x7 + +4: +.macro filter compare + subs w4, w4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ smull v2.4s, v16.4h, v0.h[0] + smlal v2.4s, v17.4h, v0.h[1] + smlal v2.4s, v18.4h, v0.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v0.h[4] + smlal v2.4s, v21.4h, v0.h[5] + smlal v2.4s, v22.4h, v0.h[6] + smull2 v3.4s, v16.8h, v0.h[0] + smlal2 v3.4s, v17.8h, v0.h[1] + smlal2 v3.4s, v18.8h, v0.h[2] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal2 v3.4s, v20.8h, v0.h[4] + smlal2 v3.4s, v21.8h, v0.h[5] + smlal2 v3.4s, v22.8h, v0.h[6] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqxtun v2.8b, v2.8h + st1 {v2.8b}, [x0], x1 +.if \compare + cmp w4, #4 +.else + b.le 9f +.endif + mov v16.16b, v17.16b + mov v17.16b, v18.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.endm + filter 1 + b.lt 7f + ld1 {v22.8h}, [x2], x7 + b 4b + +5: // Less than 4 rows in total; not all of v20-v21 are filled yet. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 6f + // LR_HAVE_BOTTOM + cmp w4, #2 + // We load at least 2 rows in all cases. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + b.gt 53f // 3 rows in total + b.eq 52f // 2 rows in total +51: // 1 row in total, v19 already loaded, load edge into v20-v22. + mov v22.16b, v21.16b + b 8f +52: // 2 rows in total, v19 already loaded, load v20 with content data + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + mov v23.16b, v22.16b + b 8f +53: + // 3 rows in total, v19 already loaded, load v20 and v21 with content + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp w4, #2 + b.gt 63f // 3 rows in total + b.eq 62f // 2 rows in total +61: // 1 row in total, v19 already loaded, pad that into v20-v22. + mov v20.16b, v19.16b + mov v21.16b, v19.16b + mov v22.16b, v19.16b + b 8f +62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. 
+ ld1 {v20.8h}, [x2], x7 + mov v21.16b, v20.16b + mov v22.16b, v20.16b + mov v23.16b, v20.16b + b 8f +63: + // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + b 8f + +7: + // All registers up to v21 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + +8: // At this point, all registers up to v22-v24 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + mov v22.16b, v23.16b + mov v23.16b, v24.16b + b 8b + +9: // End of one vertical slice. + subs w3, w3, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
+ msub x0, x1, x8, x0 + msub x2, x7, x11, x2 + add x0, x0, #8 + add x2, x2, #16 + mov w4, w8 + b 1b + +0: + ret +.purgem filter +endfunc + +// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_8bpc_neon, export=1 + adr x5, L(copy_narrow_tbl) + ldrh w6, [x5, w3, uxtw #1] + sub x5, x5, w6, uxth + br x5 +10: + add x7, x0, x1 + lsl x1, x1, #1 +18: + subs w4, w4, #8 + b.lt 110f + ld1 {v0.8b}, [x2], #8 + st1 {v0.b}[0], [x0], x1 + st1 {v0.b}[1], [x7], x1 + st1 {v0.b}[2], [x0], x1 + st1 {v0.b}[3], [x7], x1 + st1 {v0.b}[4], [x0], x1 + st1 {v0.b}[5], [x7], x1 + st1 {v0.b}[6], [x0], x1 + st1 {v0.b}[7], [x7], x1 + b.le 0f + b 18b +110: + add w4, w4, #8 + asr x1, x1, #1 +11: + subs w4, w4, #1 + ld1 {v0.b}[0], [x2], #1 + st1 {v0.b}[0], [x0], x1 + b.gt 11b +0: + ret + +20: + add x7, x0, x1 + lsl x1, x1, #1 +24: + subs w4, w4, #4 + b.lt 210f + ld1 {v0.4h}, [x2], #8 + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x7], x1 + st1 {v0.h}[2], [x0], x1 + st1 {v0.h}[3], [x7], x1 + b.le 0f + b 24b +210: + add w4, w4, #4 + asr x1, x1, #1 +22: + subs w4, w4, #1 + ld1 {v0.h}[0], [x2], #2 + st1 {v0.h}[0], [x0], x1 + b.gt 22b +0: + ret + +30: + ldrh w5, [x2] + ldrb w6, [x2, #2] + add x2, x2, #3 + subs w4, w4, #1 + strh w5, [x0] + strb w6, [x0, #2] + add x0, x0, x1 + b.gt 30b + ret + +40: + add x7, x0, x1 + lsl x1, x1, #1 +42: + subs w4, w4, #2 + b.lt 41f + ld1 {v0.2s}, [x2], #8 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x7], x1 + b.le 0f + b 42b +41: + ld1 {v0.s}[0], [x2] + st1 {v0.s}[0], [x0] +0: + ret + +50: + ldr w5, [x2] + ldrb w6, [x2, #4] + add x2, x2, #5 + subs w4, w4, #1 + str w5, [x0] + strb w6, [x0, #4] + add x0, x0, x1 + b.gt 50b + ret + +60: + ldr w5, [x2] + ldrh w6, [x2, #4] + add x2, x2, #6 + subs w4, w4, #1 + str w5, [x0] + strh w6, [x0, #4] + add x0, x0, x1 + b.gt 60b + ret + +70: + ldr w5, [x2] + ldrh w6, [x2, #4] + ldrb w7, [x2, #6] + add x2, x2, #7 + subs w4, w4, #1 + str w5, [x0] + strh w6, [x0, #4] + strb 
w7, [x0, #6] + add x0, x0, x1 + b.gt 70b + ret + +L(copy_narrow_tbl): + .hword 0 + .hword L(copy_narrow_tbl) - 10b + .hword L(copy_narrow_tbl) - 20b + .hword L(copy_narrow_tbl) - 30b + .hword L(copy_narrow_tbl) - 40b + .hword L(copy_narrow_tbl) - 50b + .hword L(copy_narrow_tbl) - 60b + .hword L(copy_narrow_tbl) - 70b +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_8bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 +1: + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w5, #14 + bic w13, w13, #7 + sub x4, x4, w13, uxtw + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + sub x12, x12, #2 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. 
+ add x4, x4, #2 + + +1: // Loop vertically + ld1 {v0.16b}, [x3], #16 + ld1 {v4.16b}, [x12], #16 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.s}[3], [x2], #4 + // Move x3/x12 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + sub x12, x12, #2 + ld1 {v5.s}[3], [x2], #4 + ext v0.16b, v1.16b, v0.16b, #14 + ext v4.16b, v5.16b, v4.16b, #14 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 2x the first byte at the front. + dup v1.16b, v0.b[0] + dup v5.16b, v4.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + sub x12, x12, #2 + ext v0.16b, v1.16b, v0.16b, #14 + ext v4.16b, v5.16b, v4.16b, #14 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr b30, [x3, w13, sxtw] + ldr b31, [x12, w13, sxtw] + // Fill v30/v31 with the right padding pixel + dup v30.8b, v30.b[0] + dup v31.8b, v31.b[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + cmp w5, #6 + b.ge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro uaddl_nh dst1, dst2, src1, src2, w + uaddl \dst1, \src1\().4h, \src2\().4h +.if \w > 4 + uaddl2 \dst2, \src1\().8h, \src2\().8h +.endif +.endm +.macro uaddw_nh dst1, dst2, src, w + uaddw \dst1, \dst1, \src\().4h +.if \w > 4 + uaddw2 \dst2, \dst2, \src\().8h +.endif +.endm +.macro add_nh dst1, dst2, src1, src2, w + add \dst1, \dst1, \src1 +.if \w > 4 + add \dst2, \dst2, \src2 +.endif +.endm + +.macro add3 w + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v18.16b, v4.16b, v4.16b, #1 + ext v19.16b, v4.16b, v4.16b, #2 + uaddl v3.8h, v0.8b, v16.8b + uaddw v3.8h, v3.8h, v17.8b + uaddl v7.8h, v4.8b, v18.8b + uaddw v7.8h, v7.8h, v19.8b + + ext v20.16b, v1.16b, v2.16b, #2 + ext v21.16b, v1.16b, v2.16b, #4 + ext v22.16b, v5.16b, v6.16b, #2 + ext v23.16b, v5.16b, v6.16b, #4 + + uaddl_nh v26.4s, v27.4s, v1, v20, \w + uaddw_nh v26.4s, v27.4s, v21, \w + + uaddl_nh v28.4s, v29.4s, v5, v22, \w + uaddw_nh v28.4s, v29.4s, v23, \w +.endm + add3 8 + st1 {v3.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v26.4s,v27.4s}, [x0], #32 + st1 {v28.4s,v29.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x3], #8 + ld1 {v7.8b}, [x12], #8 + mov v1.16b, v2.16b + mov v5.16b, v6.16b + ext v0.16b, v0.16b, v3.16b, #8 + ext v4.16b, v4.16b, v7.16b, #8 + umull v2.8h, v3.8b, v3.8b + umull v6.8h, v7.8b, v7.8b + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 6 <= w < 10 + add3 4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + + subs w5, w5, #4 // 2 <= w < 6 + ext v0.16b, v0.16b, v0.16b, #4 + ext v4.16b, v4.16b, v4.16b, #4 + +6: // Pad the right edge and produce the last few pixels. 
+ // 2 <= w < 6, 2-5 pixels valid in v0 + sub w13, w5, #2 + // w13 = (pixels valid - 2) + adr x14, L(box3_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #2 + ext v4.16b, v4.16b, v4.16b, #2 + ext v0.16b, v0.16b, v30.16b, #14 + ext v4.16b, v4.16b, v31.16b, #14 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #3 + ext v4.16b, v4.16b, v4.16b, #3 + ext v0.16b, v0.16b, v30.16b, #13 + ext v4.16b, v4.16b, v31.16b, #13 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v4.16b, v4.16b, v4.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v4.16b, v4.16b, v31.16b, #12 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #5 + ext v4.16b, v4.16b, v4.16b, #5 + ext v0.16b, v0.16b, v30.16b, #11 + ext v4.16b, v4.16b, v31.16b, #11 + b 88f + +L(box3_variable_shift_tbl): + .hword L(box3_variable_shift_tbl) - 22b + .hword L(box3_variable_shift_tbl) - 33b + .hword L(box3_variable_shift_tbl) - 44b + .hword L(box3_variable_shift_tbl) - 55b + +88: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + + add3 4 + subs w5, w5, #4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v0.16b, #4 + ext v4.16b, v4.16b, v4.16b, #4 + ext v1.16b, v1.16b, v2.16b, #8 + ext v5.16b, v5.16b, v6.16b, #8 + // Only one needed pixel left, but do a normal 4 pixel + // addition anyway + add3 4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + 
ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_8bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + add w14, w5, #13 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 + add w14, w5, #15 +1: + sub x9, x9, w13, uxtw #1 + bic w14, w14, #7 + sub x4, x4, w14, uxtw + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + sub x12, x12, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #3 + +1: // Loop vertically + ld1 {v0.16b}, [x3], #16 + ld1 {v4.16b}, [x12], #16 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.s}[3], [x2], #4 + // Move x3/x12 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. 
+ sub x3, x3, #3 + sub x12, x12, #3 + ld1 {v5.s}[3], [x2], #4 + ext v0.16b, v1.16b, v0.16b, #13 + ext v4.16b, v5.16b, v4.16b, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 3x the first byte at the front. + dup v1.16b, v0.b[0] + dup v5.16b, v4.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x3, x3, #3 + sub x12, x12, #3 + ext v0.16b, v1.16b, v0.16b, #13 + ext v4.16b, v5.16b, v4.16b, #13 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr b30, [x3, w13, sxtw] + ldr b31, [x12, w13, sxtw] + // Fill v30/v31 with the right padding pixel + dup v30.8b, v30.b[0] + dup v31.8b, v31.b[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v18.16b, v0.16b, v0.16b, #3 + ext v19.16b, v0.16b, v0.16b, #4 + ext v20.16b, v4.16b, v4.16b, #1 + ext v21.16b, v4.16b, v4.16b, #2 + ext v22.16b, v4.16b, v4.16b, #3 + ext v23.16b, v4.16b, v4.16b, #4 + uaddl v3.8h, v0.8b, v16.8b + uaddl v24.8h, v17.8b, v18.8b + uaddl v7.8h, v4.8b, v20.8b + uaddw v3.8h, v3.8h, v19.8b + uaddl v25.8h, v21.8b, v22.8b + uaddw v7.8h, v7.8h, v23.8b + add v3.8h, v3.8h, v24.8h + add v7.8h, v7.8h, v25.8h + + ext v16.16b, v1.16b, v2.16b, #2 + ext v17.16b, v1.16b, v2.16b, #4 + ext v18.16b, v1.16b, v2.16b, #6 + ext v19.16b, v1.16b, v2.16b, #8 + ext v20.16b, v5.16b, v6.16b, #2 + ext v21.16b, v5.16b, v6.16b, #4 + ext v22.16b, v5.16b, v6.16b, #6 + ext v23.16b, v5.16b, v6.16b, #8 + + uaddl_nh v26.4s, v27.4s, v1, v16, \w + uaddl_nh v16.4s, v17.4s, v17, v18, \w + uaddl_nh v28.4s, v29.4s, v5, v20, \w + uaddw_nh v26.4s, v27.4s, v19, \w + uaddl_nh v20.4s, v21.4s, v21, v22, \w + uaddw_nh v28.4s, v29.4s, v23, \w + add_nh v26.4s, v27.4s, v16.4s, v17.4s, \w + add_nh v28.4s, v29.4s, v20.4s, v21.4s, \w +.endm + add5 8 + st1 {v3.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v26.4s,v27.4s}, [x0], #32 + st1 {v28.4s,v29.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x3], #8 + ld1 {v7.8b}, [x12], #8 + mov v1.16b, v2.16b + mov v5.16b, v6.16b + ext v0.16b, v0.16b, v3.16b, #8 + ext v4.16b, v4.16b, v7.16b, #8 + umull v2.8h, v3.8b, v3.8b + umull v6.8h, v7.8b, v7.8b + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Produce 4 pixels, 7 <= w < 11 + add5 4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + + subs w5, w5, #4 // 3 <= w < 7 + ext v0.16b, v0.16b, v0.16b, #4 + ext v4.16b, v4.16b, v4.16b, #4 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in v0/v4 + sub w13, w5, #1 + // w13 = pixels valid - 2 + adr x14, L(box5_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #2 + ext v4.16b, v4.16b, v4.16b, #2 + ext v0.16b, v0.16b, v30.16b, #14 + ext v4.16b, v4.16b, v31.16b, #14 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #3 + ext v4.16b, v4.16b, v4.16b, #3 + ext v0.16b, v0.16b, v30.16b, #13 + ext v4.16b, v4.16b, v31.16b, #13 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v4.16b, v4.16b, v4.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v4.16b, v4.16b, v31.16b, #12 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #5 + ext v4.16b, v4.16b, v4.16b, #5 + ext v0.16b, v0.16b, v30.16b, #11 + ext v4.16b, v4.16b, v31.16b, #11 + b 88f +66: // 6 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v4.16b, v4.16b, v4.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v4.16b, v4.16b, v31.16b, #10 + b 88f +77: // 7 pixels valid + ext v0.16b, v0.16b, v0.16b, #7 + ext v4.16b, v4.16b, v4.16b, #7 + ext v0.16b, v0.16b, v30.16b, #9 + ext v4.16b, v4.16b, v31.16b, #9 + b 88f + +L(box5_variable_shift_tbl): + .hword L(box5_variable_shift_tbl) - 22b + .hword L(box5_variable_shift_tbl) - 33b + .hword L(box5_variable_shift_tbl) - 44b + .hword L(box5_variable_shift_tbl) - 55b + .hword L(box5_variable_shift_tbl) - 66b + .hword L(box5_variable_shift_tbl) - 77b + +88: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + 
umull2 v6.8h, v4.16b, v4.16b + + add5 4 + subs w5, w5, #4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v0.16b, #4 + ext v1.16b, v1.16b, v2.16b, #8 + ext v4.16b, v4.16b, v4.16b, #4 + ext v5.16b, v5.16b, v6.16b, #8 + add5 4 + st1 {v3.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v26.4s}, [x0], #16 + st1 {v28.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add5 +endfunc + +sgr_funcs 8 diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S new file mode 100644 index 0000000000..437988cfac --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -0,0 +1,1239 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max + ld1 {v0.8h}, [x4] + clz w8, w8 + movi v30.4s, #1 + sub w9, w8, #38 // -(bitdepth + 6) + sub w8, w8, #25 // -round_bits_h + neg w9, w9 // bitdepth + 6 + dup v1.4s, w9 + dup v29.4s, w8 // -round_bits_h + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v1.4s // 1 << (bitdepth + 6) + mov w8, w5 + // Calculate mid_stride + add w10, w5, #7 + bic w10, w10, #7 + lsl w10, w10, #1 + + // Clear the last unused element of v0, to allow filtering a single + // pixel with one plain mul+addv. + ins v0.h[7], wzr + + // Set up pointers for reading/writing alternate rows + add x12, x0, x10 + lsl w10, w10, #1 + add x13, x2, x3 + lsl x3, x3, #1 + + // Subtract the width from mid_stride + sub x10, x10, w5, uxtw #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. 
+ cmp w5, #8 + add w11, w5, #13 + bic w11, w11, #7 + b.ge 1f + mov w11, #16 +1: + sub x3, x3, w11, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x1, 0f + // left == NULL + sub x2, x2, #6 + sub x13, x13, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x3, x3, #6 + + +1: // Loop vertically + ld1 {v2.8h, v3.8h}, [x2], #32 + ld1 {v4.8h, v5.8h}, [x13], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x1, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.d}[1], [x1], #8 + // Move x2/x13 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x2, x2, #6 + sub x13, x13, #6 + ld1 {v6.d}[1], [x1], #8 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v1.16b, v2.16b, #10 + ext v5.16b, v4.16b, v5.16b, #10 + ext v4.16b, v6.16b, v4.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost pixel + // and shift v2/v3 to have 3x the first pixel at the front. + dup v1.8h, v2.h[0] + dup v6.8h, v4.h[0] + // Move x2/x13 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x2, x2, #6 + sub x13, x13, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v1.16b, v2.16b, #10 + ext v5.16b, v4.16b, v5.16b, #10 + ext v4.16b, v6.16b, v4.16b, #10 + +2: + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w9, w5, #14 + ldr h27, [x2, w9, sxtw #1] + ldr h28, [x13, w9, sxtw #1] + // Fill v27/v28 with the right padding pixel + dup v27.8h, v27.h[0] + dup v28.8h, v28.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro ushll_sz d0, d1, src, shift, wd + ushll \d0\().4s, \src\().4h, \shift +.ifc \wd, .8h + ushll2 \d1\().4s, \src\().8h, \shift +.endif +.endm +.macro add_sz d0, d1, s0, s1, c, wd + add \d0\().4s, \s0\().4s, \c\().4s +.ifc \wd, .8h + add \d1\().4s, \s1\().4s, \c\().4s +.endif +.endm +.macro srshl_sz d0, d1, s0, s1, c, wd + srshl \d0\().4s, \s0\().4s, \c\().4s +.ifc \wd, .8h + srshl \d1\().4s, \s1\().4s, \c\().4s +.endif +.endm +.macro sqxtun_sz dst, s0, s1, wd + sqxtun \dst\().4h, \s0\().4s +.ifc \wd, .8h + sqxtun2 \dst\().8h, \s1\().4s +.endif +.endm + +.macro filter wd + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v18.16b, v2.16b, v3.16b, #6 + ext v16.16b, v2.16b, v3.16b, #2 + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v20.16b, v2.16b, v3.16b, #10 + ushll_sz v6, v7, v18, #7, \wd + ext v21.16b, v2.16b, v3.16b, #12 + smlal v6.4s, v2.4h, v0.h[0] + smlal v6.4s, v16.4h, v0.h[1] + smlal v6.4s, v17.4h, v0.h[2] + smlal v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[4] + smlal v6.4s, v20.4h, v0.h[5] + smlal v6.4s, v21.4h, v0.h[6] +.ifc \wd, .8h + smlal2 v7.4s, v2.8h, v0.h[0] + smlal2 v7.4s, v16.8h, v0.h[1] + smlal2 v7.4s, v17.8h, v0.h[2] + smlal2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[4] + smlal2 v7.4s, v20.8h, v0.h[5] + smlal2 v7.4s, v21.8h, v0.h[6] +.endif + ext v21.16b, v4.16b, v5.16b, #6 + ext v19.16b, v4.16b, v5.16b, #2 + ext v20.16b, v4.16b, v5.16b, #4 + ext v22.16b, v4.16b, v5.16b, #8 + ext v23.16b, v4.16b, v5.16b, #10 + ushll_sz v16, v17, v21, #7, \wd + ext v24.16b, v4.16b, v5.16b, #12 + smlal v16.4s, v4.4h, v0.h[0] + smlal v16.4s, v19.4h, v0.h[1] + smlal v16.4s, v20.4h, v0.h[2] + smlal v16.4s, v21.4h, v0.h[3] + smlal v16.4s, v22.4h, v0.h[4] + 
smlal v16.4s, v23.4h, v0.h[5] + smlal v16.4s, v24.4h, v0.h[6] +.ifc \wd, .8h + smlal2 v17.4s, v4.8h, v0.h[0] + smlal2 v17.4s, v19.8h, v0.h[1] + smlal2 v17.4s, v20.8h, v0.h[2] + smlal2 v17.4s, v21.8h, v0.h[3] + smlal2 v17.4s, v22.8h, v0.h[4] + smlal2 v17.4s, v23.8h, v0.h[5] + smlal2 v17.4s, v24.8h, v0.h[6] +.endif + mvni v24\wd, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add_sz v6, v7, v6, v7, v30, \wd + add_sz v16, v17, v16, v17, v30, \wd + srshl_sz v6, v7, v6, v7, v29, \wd + srshl_sz v16, v17, v16, v17, v29, \wd + sqxtun_sz v6, v6, v7, \wd + sqxtun_sz v7, v16, v17, \wd + umin v6\wd, v6\wd, v24\wd + umin v7\wd, v7\wd, v24\wd + sub v6\wd, v6\wd, v31\wd + sub v7\wd, v7\wd, v31\wd +.endm + filter .8h + st1 {v6.8h}, [x0], #16 + st1 {v7.8h}, [x12], #16 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v2.16b, v3.16b + mov v4.16b, v5.16b + ld1 {v3.8h}, [x2], #16 + ld1 {v5.8h}, [x13], #16 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Filter 4 pixels, 7 <= w < 11 + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + + subs w5, w5, #4 // 3 <= w < 7 + ext v2.16b, v2.16b, v3.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + +6: // Pad the right edge and filter the last few pixels. + // w < 7, w+3 pixels valid in v2-v3 + cmp w5, #5 + b.lt 7f + b.gt 8f + // w == 5, 8 pixels valid in v2, v3 invalid + mov v3.16b, v27.16b + mov v5.16b, v28.16b + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in v2 + sub w9, w5, #1 + // w9 = (pixels valid - 4) + adr x11, L(variable_shift_tbl) + ldrh w9, [x11, w9, uxtw #1] + sub x11, x11, w9, uxth + mov v3.16b, v27.16b + mov v5.16b, v28.16b + br x11 +44: // 4 pixels valid in v2/v4, fill the high half with padding. 
+ ins v2.d[1], v3.d[0] + ins v4.d[1], v5.d[0] + b 88f + // Shift v2 right, shifting out invalid pixels, + // shift v2 left to the original offset, shifting in padding pixels. +55: // 5 pixels valid + ext v2.16b, v2.16b, v2.16b, #10 + ext v2.16b, v2.16b, v3.16b, #6 + ext v4.16b, v4.16b, v4.16b, #10 + ext v4.16b, v4.16b, v5.16b, #6 + b 88f +66: // 6 pixels valid, fill the upper 2 pixels with padding. + ins v2.s[3], v3.s[0] + ins v4.s[3], v5.s[0] + b 88f +77: // 7 pixels valid, fill the last pixel with padding. + ins v2.h[7], v3.h[0] + ins v4.h[7], v5.h[0] + b 88f + +L(variable_shift_tbl): + .hword L(variable_shift_tbl) - 44b + .hword L(variable_shift_tbl) - 55b + .hword L(variable_shift_tbl) - 66b + .hword L(variable_shift_tbl) - 77b + +8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 + ins v27.h[0], v3.h[0] + ins v28.h[0], v5.h[0] + mov v3.16b, v27.16b + mov v5.16b, v28.16b + +88: + // w < 7, v2-v3 padded properly + cmp w5, #4 + b.lt 888f + + // w >= 4, filter 4 pixels + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + subs w5, w5, #4 // 0 <= w < 4 + ext v2.16b, v2.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + b.eq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + smull v6.4s, v2.4h, v0.4h + smull2 v7.4s, v2.8h, v0.8h + smull v16.4s, v4.4h, v0.4h + smull2 v17.4s, v4.8h, v0.8h + add v6.4s, v6.4s, v7.4s + add v16.4s, v16.4s, v17.4s + addv s6, v6.4s + addv s7, v16.4s + dup v16.4h, v2.h[3] + ins v16.h[1], v4.h[3] + ins v6.s[1], v7.s[0] + mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + ushll v16.4s, v16.4h, #7 + add v6.2s, v6.2s, v30.2s + add v6.2s, v6.2s, v16.2s + srshl v6.2s, v6.2s, v29.2s + sqxtun v6.4h, v6.4s + umin v6.4h, v6.4h, v24.4h + sub v6.4h, v6.4h, v31.4h + st1 {v6.h}[0], [x0], #2 + st1 {v6.h}[1], [x12], #2 + subs w5, w5, #1 + ext v2.16b, v2.16b, v3.16b, #2 + ext v4.16b, v4.16b, v5.16b, #2 + b.gt 888b + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x10 + add x12, x12, 
x10 + add x2, x2, x3 + add x13, x13, x3 + mov w5, w8 + b 1b +0: + ret +.purgem filter +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max + ld1 {v0.8h}, [x5] + dup v31.8h, w8 + clz w8, w8 + movi v1.8h, #128 + sub w8, w8, #11 // round_bits_v + add v1.8h, v1.8h, v0.8h + dup v30.4s, w8 + mov w8, w4 + neg v30.4s, v30.4s // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov w11, w4 + tst w6, #4 // LR_HAVE_TOP + b.eq 0f + sub x2, x2, x7, lsl #1 + add w11, w11, #2 +0: + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 1f + add w11, w11, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into v16-v19 and pad properly. + tst w6, #4 // LR_HAVE_TOP + ld1 {v16.8h}, [x2], x7 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.8h}, [x2], x7 + mov v17.16b, v16.16b + ld1 {v19.8h}, [x2], x7 + b 3f +2: // !LR_HAVE_TOP + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + +3: + cmp w4, #4 + b.lt 5f + // Start filtering normally; fill in v20-v22 with unique rows. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + ld1 {v22.8h}, [x2], x7 + +4: +.macro filter compare + subs w4, w4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ smull v2.4s, v16.4h, v0.h[0] + smlal v2.4s, v17.4h, v0.h[1] + smlal v2.4s, v18.4h, v0.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v0.h[4] + smlal v2.4s, v21.4h, v0.h[5] + smlal v2.4s, v22.4h, v0.h[6] + smull2 v3.4s, v16.8h, v0.h[0] + smlal2 v3.4s, v17.8h, v0.h[1] + smlal2 v3.4s, v18.8h, v0.h[2] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal2 v3.4s, v20.8h, v0.h[4] + smlal2 v3.4s, v21.8h, v0.h[5] + smlal2 v3.4s, v22.8h, v0.h[6] + srshl v2.4s, v2.4s, v30.4s // round_bits_v + srshl v3.4s, v3.4s, v30.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + umin v2.8h, v2.8h, v31.8h // bitdepth_max + st1 {v2.8h}, [x0], x1 +.if \compare + cmp w4, #4 +.else + b.le 9f +.endif + mov v16.16b, v17.16b + mov v17.16b, v18.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.endm + filter 1 + b.lt 7f + ld1 {v22.8h}, [x2], x7 + b 4b + +5: // Less than 4 rows in total; not all of v20-v21 are filled yet. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 6f + // LR_HAVE_BOTTOM + cmp w4, #2 + // We load at least 2 rows in all cases. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + b.gt 53f // 3 rows in total + b.eq 52f // 2 rows in total +51: // 1 row in total, v19 already loaded, load edge into v20-v22. + mov v22.16b, v21.16b + b 8f +52: // 2 rows in total, v19 already loaded, load v20 with content data + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + mov v23.16b, v22.16b + b 8f +53: + // 3 rows in total, v19 already loaded, load v20 and v21 with content + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp w4, #2 + b.gt 63f // 3 rows in total + b.eq 62f // 2 rows in total +61: // 1 row in total, v19 already loaded, pad that into v20-v22. + mov v20.16b, v19.16b + mov v21.16b, v19.16b + mov v22.16b, v19.16b + b 8f +62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. 
+ ld1 {v20.8h}, [x2], x7 + mov v21.16b, v20.16b + mov v22.16b, v20.16b + mov v23.16b, v20.16b + b 8f +63: + // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + b 8f + +7: + // All registers up to v21 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + +8: // At this point, all registers up to v22-v24 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + mov v22.16b, v23.16b + mov v23.16b, v24.16b + b 8b + +9: // End of one vertical slice. + subs w3, w3, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
+ msub x0, x1, x8, x0 + msub x2, x7, x11, x2 + add x0, x0, #16 + add x2, x2, #16 + mov w4, w8 + b 1b + +0: + ret +.purgem filter +endfunc + +// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_16bpc_neon, export=1 + adr x5, L(copy_narrow_tbl) + ldrh w6, [x5, w3, uxtw #1] + sub x5, x5, w6, uxth + br x5 +10: + add x7, x0, x1 + lsl x1, x1, #1 +18: + subs w4, w4, #8 + b.lt 110f + ld1 {v0.8h}, [x2], #16 + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x7], x1 + st1 {v0.h}[2], [x0], x1 + st1 {v0.h}[3], [x7], x1 + st1 {v0.h}[4], [x0], x1 + st1 {v0.h}[5], [x7], x1 + st1 {v0.h}[6], [x0], x1 + st1 {v0.h}[7], [x7], x1 + b.le 0f + b 18b +110: + add w4, w4, #8 + asr x1, x1, #1 +11: + subs w4, w4, #1 + ld1 {v0.h}[0], [x2], #2 + st1 {v0.h}[0], [x0], x1 + b.gt 11b +0: + ret + +20: + add x7, x0, x1 + lsl x1, x1, #1 +24: + subs w4, w4, #4 + b.lt 210f + ld1 {v0.4s}, [x2], #16 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x7], x1 + st1 {v0.s}[2], [x0], x1 + st1 {v0.s}[3], [x7], x1 + b.le 0f + b 24b +210: + add w4, w4, #4 + asr x1, x1, #1 +22: + subs w4, w4, #1 + ld1 {v0.s}[0], [x2], #4 + st1 {v0.s}[0], [x0], x1 + b.gt 22b +0: + ret + +30: + ldr w5, [x2] + ldrh w6, [x2, #4] + add x2, x2, #6 + subs w4, w4, #1 + str w5, [x0] + strh w6, [x0, #4] + add x0, x0, x1 + b.gt 30b + ret + +40: + add x7, x0, x1 + lsl x1, x1, #1 +42: + subs w4, w4, #2 + b.lt 41f + ld1 {v0.2d}, [x2], #16 + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x7], x1 + b.le 0f + b 42b +41: + ld1 {v0.4h}, [x2] + st1 {v0.4h}, [x0] +0: + ret + +50: + ldr x5, [x2] + ldrh w6, [x2, #8] + add x2, x2, #10 + subs w4, w4, #1 + str x5, [x0] + strh w6, [x0, #8] + add x0, x0, x1 + b.gt 50b + ret + +60: + ldr x5, [x2] + ldr w6, [x2, #8] + add x2, x2, #12 + subs w4, w4, #1 + str x5, [x0] + str w6, [x0, #8] + add x0, x0, x1 + b.gt 60b + ret + +70: + ldr x5, [x2] + ldr w6, [x2, #8] + ldrh w7, [x2, #12] + add x2, x2, #14 + subs w4, w4, #1 + str x5, [x0] + str w6, [x0, #8] + strh 
w7, [x0, #12] + add x0, x0, x1 + b.gt 70b + ret + +L(copy_narrow_tbl): + .hword 0 + .hword L(copy_narrow_tbl) - 10b + .hword L(copy_narrow_tbl) - 20b + .hword L(copy_narrow_tbl) - 30b + .hword L(copy_narrow_tbl) - 40b + .hword L(copy_narrow_tbl) - 50b + .hword L(copy_narrow_tbl) - 60b + .hword L(copy_narrow_tbl) - 70b +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 +1: + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w5, #14 + bic w13, w13, #7 + sub x4, x4, w13, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + sub x12, x12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. 
+ add x4, x4, #4 + + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + sub x12, x12, #4 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 2x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3/x12 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + sub x12, x12, #4 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + cmp w5, #6 + b.ge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro ext_n dst1, dst2, src1, src2, src3, n, w + ext \dst1, \src1, \src2, \n +.if \w > 4 + ext \dst2, \src2, \src3, \n +.endif +.endm +.macro add_n dst1, dst2, src1, src2, src3, src4, w + add \dst1, \src1, \src3 +.if \w > 4 + add \dst2, \src2, \src4 +.endif +.endm + +.macro add3 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + add v6\wd, v0\wd, v24\wd + add v7\wd, v16\wd, v26\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add3 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Produce 4 pixels, 6 <= w < 10 + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 2 <= w < 6 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // 2 <= w < 6, 2-5 pixels valid in v0 + sub w13, w5, #2 + // w13 = (pixels valid - 2) + adr x14, L(box3_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f + +L(box3_variable_shift_tbl): + .hword L(box3_variable_shift_tbl) - 22b + .hword L(box3_variable_shift_tbl) - 33b + .hword L(box3_variable_shift_tbl) - 44b + .hword L(box3_variable_shift_tbl) - 55b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + + add3 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + // Only one needed pixel left, but do a normal 4 
pixel + // addition anyway + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + add w14, w5, #13 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 + add w14, w5, #15 +1: + sub x9, x9, w13, uxtw #1 + bic w14, w14, #7 + sub x4, x4, w14, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + sub x12, x12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. 
+ add x4, x4, #6 + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + sub x12, x12, #6 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3/x12 back to account for the last 6 bytes we loaded before, + // which we shifted out. + sub x3, x3, #6 + sub x12, x12, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v0.16b, v1.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 + + add v6\wd, v0\wd, v24\wd + add v25\wd, v25\wd, v26\wd + add v6\wd, v6\wd, v27\wd + + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + ext v28.16b, v16.16b, v17.16b, #6 + ext v29.16b, v16.16b, v17.16b, #8 + + add v7\wd, v16\wd, v26\wd + add v27\wd, v27\wd, v28\wd + add v7\wd, v7\wd, v29\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add5 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If 
we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 7 <= w < 11 + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 3 <= w < 7 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in v0/v16 + sub w13, w5, #1 + // w13 = pixels valid - 2 + adr x14, L(box5_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + mov v1.16b, v30.16b + mov v17.16b, v31.16b + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f +66: // 6 pixels valid + ext v0.16b, v0.16b, v0.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v0.16b, v0.16b, v30.16b, #4 + ext v16.16b, v16.16b, v31.16b, #4 + b 88f +77: // 7 pixels valid + ext v0.16b, v0.16b, v0.16b, #14 + ext v16.16b, v16.16b, v16.16b, #14 + ext v0.16b, v0.16b, v30.16b, #2 + ext v16.16b, v16.16b, v31.16b, #2 + b 88f + +L(box5_variable_shift_tbl): + .hword L(box5_variable_shift_tbl) - 22b + .hword L(box5_variable_shift_tbl) - 33b + .hword L(box5_variable_shift_tbl) - 44b + .hword
L(box5_variable_shift_tbl) - 55b + .hword L(box5_variable_shift_tbl) - 66b + .hword L(box5_variable_shift_tbl) - 77b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + add5 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add5 +endfunc + +sgr_funcs 16 diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S new file mode 100644 index 0000000000..200eb63189 --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration_common.S @@ -0,0 +1,432 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #2 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 1f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Sum all h+2 lines with the main loop + add w11, w11, #2 +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. 
+ // Start loading rows into v16-v21 and v24-v26 taking top + // padding into consideration. + tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v24.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.4s, v19.4s}, [x5], x7 + ld1 {v25.8h}, [x6], x8 + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v25.16b, v24.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v26.16b, v24.16b + +3: + subs w3, w3, #1 +.macro add3 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v24.8h, v24.8h, v25.8h + add v16.4s, v16.4s, v20.4s + add v17.4s, v17.4s, v21.4s + add v24.8h, v24.8h, v26.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v24.8h}, [x1], x8 +.endm + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v25.16b, v26.16b + b.le 4f + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3b + +4: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + add3 + +5: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
+ // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #8 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 0f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Handle h+2 lines with the main loop + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub w3, w3, #1 // Handle h-1 lines with the main loop +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v25 and v26-v30 taking top + // padding into consideration. 
+ tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v28.8h}, [x6], x8 + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v28.16b, v26.16b + mov v22.16b, v16.16b + mov v23.16b, v17.16b + mov v29.16b, v26.16b + +3: + cbz w3, 4f + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + +3: + // Start of vertical loop + subs w3, w3, #2 +.macro add5 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v26.8h, v26.8h, v27.8h + add v0.4s, v20.4s, v22.4s + add v1.4s, v21.4s, v23.4s + add v2.8h, v28.8h, v29.8h + add v16.4s, v16.4s, v24.4s + add v17.4s, v17.4s, v25.4s + add v26.8h, v26.8h, v30.8h + add v16.4s, v16.4s, v0.4s + add v17.4s, v17.4s, v1.4s + add v26.8h, v26.8h, v2.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v26.8h}, [x1], x8 +.endm + add5 +.macro shift2 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v26.16b, v28.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v27.16b, v29.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v28.16b, v30.16b +.endm + shift2 + add x0, x0, x7 + add x1, x1, x8 + b.le 5f + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + add5 + b 6f + +5: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 6f + // !LR_HAVE_BOTTOM + cbnz w3, 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. 
+ ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + // Pad the past-edge row from the last content row. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // w3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v29.16b, v28.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v30.16b, v28.16b + add5 + add x0, x0, x7 + add x1, x1, x8 + b 6f + +6: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + clz w9, w5 + add x3, x3, #2 // h += 2 + movi v31.4s, #9 // n + mov x5, #455 + mov x8, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + clz w9, w5 + add x3, x3, #3 // h += 3 + asr x3, x3, #1 // h /= 2 + movi v31.4s, #25 // n + mov x5, #164 + mov x8, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + sub w9, w9, #24 // -bitdepth_min_8 + movrel x12, X(sgr_x_by_x) + ld1 {v16.16b, v17.16b, v18.16b}, [x12] + dup v6.8h, w9 // -bitdepth_min_8 + movi v19.16b, #5 + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + movi v24.8b, #254 // idx of last 1 + saddl v7.4s, v6.4h, v6.4h // 
-2*bitdepth_min_8 + add x2, x2, #2 // w += 2 + add x7, x2, #7 + bic x7, x7, #7 // aligned w + sub x7, x8, x7 // increment between rows + movi v29.8h, #1, lsl #8 + dup v28.4s, w4 + dup v30.4s, w5 // one_by_x + sub x0, x0, #(4*(SUM_STRIDE)) + sub x1, x1, #(2*(SUM_STRIDE)) + mov x6, x2 // backup of w + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b +1: + subs x2, x2, #8 + ld1 {v0.4s, v1.4s}, [x0] // a + ld1 {v2.8h}, [x1] // b + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v1.8b, v1.8b, v5.8b + add v1.8b, v1.8b, v25.8b + uxtl v1.8h, v1.8b // x + + umull v3.4s, v1.4h, v2.4h // x * BB[i] + umull2 v4.4s, v1.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v2.8h, v29.8h, v1.8h // 256 - x + + st1 {v3.4s, v4.4s}, [x0], #32 + st1 {v2.8h}, [x1], #16 + b.gt 1b + + subs x3, x3, #1 + b.le 0f + add x0, x0, x7, lsl #2 + add x1, x1, x7, lsl #1 + mov x2, x6 + b 1b +0: + ret 
+endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S new file mode 100644 index 0000000000..520365b41e --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S @@ -0,0 +1,597 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 + sub x7, x3, #(4*SUM_STRIDE) + add x8, x3, #(4*SUM_STRIDE) + sub x9, x4, #(2*SUM_STRIDE) + add x10, x4, #(2*SUM_STRIDE) + mov x11, #SUM_STRIDE + mov x12, #FILTER_OUT_STRIDE + add x13, x5, #7 + bic x13, x13, #7 // Aligned width +.if \bpc == 8 + sub x2, x2, x13 +.else + sub x2, x2, x13, lsl #1 +.endif + sub x12, x12, x13 + sub x11, x11, x13 + sub x11, x11, #4 // We read 4 extra elements from a + sub x14, x11, #4 // We read 8 extra elements from b + mov x13, x5 + movi v6.8h, #3 + movi v7.4s, #3 +1: + ld1 {v0.8h, v1.8h}, [x9], #32 + ld1 {v2.8h, v3.8h}, [x4], #32 + ld1 {v4.8h, v5.8h}, [x10], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 + +2: + subs x5, x5, #8 + ext v25.16b, v0.16b, v1.16b, #2 // -stride + ext v26.16b, v2.16b, v3.16b, #2 // 0 + ext v27.16b, v4.16b, v5.16b, #2 // +stride + ext v28.16b, v0.16b, v1.16b, #4 // +1-stride + ext v29.16b, v2.16b, v3.16b, #4 // +1 + ext v30.16b, v4.16b, v5.16b, #4 // +1+stride + add v2.8h, v2.8h, v25.8h // -1, -stride + add v26.8h, v26.8h, v27.8h // 0, +stride + add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride + add v2.8h, v2.8h, v26.8h + add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride + add v2.8h, v2.8h, v29.8h // +1 + add v0.8h, v0.8h, v4.8h + + ext v25.16b, v16.16b, v17.16b, #4 // -stride + ext v26.16b, v17.16b, v18.16b, #4 + shl v2.8h, v2.8h, #2 + ext v27.16b, v16.16b, v17.16b, #8 // +1-stride + ext v28.16b, v17.16b, v18.16b, #8 + ext v29.16b, v19.16b, v20.16b, #4 // 0 + ext v30.16b, v20.16b, v21.16b, #4 + mla v2.8h, v0.8h, v6.8h // * 3 -> a + add v25.4s, v25.4s, v19.4s // -stride, -1 + add v26.4s, v26.4s, v20.4s + add v16.4s, 
v16.4s, v27.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v28.4s + ext v27.16b, v19.16b, v20.16b, #8 // +1 + ext v28.16b, v20.16b, v21.16b, #8 + add v16.4s, v16.4s, v22.4s // -1+stride + add v17.4s, v17.4s, v23.4s + add v29.4s, v29.4s, v27.4s // 0, +1 + add v30.4s, v30.4s, v28.4s + add v25.4s, v25.4s, v29.4s + add v26.4s, v26.4s, v30.4s + ext v27.16b, v22.16b, v23.16b, #4 // +stride + ext v28.16b, v23.16b, v24.16b, #4 + ext v29.16b, v22.16b, v23.16b, #8 // +1+stride + ext v30.16b, v23.16b, v24.16b, #8 +.if \bpc == 8 + ld1 {v19.8b}, [x1], #8 // src +.else + ld1 {v19.8h}, [x1], #16 // src +.endif + add v25.4s, v25.4s, v27.4s // +stride + add v26.4s, v26.4s, v28.4s + add v16.4s, v16.4s, v29.4s // +1+stride + add v17.4s, v17.4s, v30.4s + shl v25.4s, v25.4s, #2 + shl v26.4s, v26.4s, #2 + mla v25.4s, v16.4s, v7.4s // * 3 -> b + mla v26.4s, v17.4s, v7.4s +.if \bpc == 8 + uxtl v19.8h, v19.8b // src +.endif + mov v0.16b, v1.16b + umlal v25.4s, v2.4h, v19.4h // b + a * src + umlal2 v26.4s, v2.8h, v19.8h + mov v2.16b, v3.16b + rshrn v25.4h, v25.4s, #9 + rshrn2 v25.8h, v26.4s, #9 + mov v4.16b, v5.16b + st1 {v25.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + mov v22.16b, v24.16b + ld1 {v1.8h}, [x9], #16 + ld1 {v3.8h}, [x4], #16 + ld1 {v5.8h}, [x10], #16 + ld1 {v17.4s, v18.4s}, [x7], #32 + ld1 {v20.4s, v21.4s}, [x3], #32 + ld1 {v23.4s, v24.4s}, [x8], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x13 + add x0, x0, x12, lsl #1 + add x1, x1, x2 + add x3, x3, x11, lsl #2 + add x7, x7, x11, lsl #2 + add x8, x8, x11, lsl #2 + add x4, x4, x14, lsl #1 + add x9, x9, x14, lsl #1 + add x10, x10, x14, lsl #1 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 + add x7, x3, #(4*(SUM_STRIDE)) + sub x3, x3, #(4*(SUM_STRIDE)) + add x8, x4, 
#(2*(SUM_STRIDE)) + sub x4, x4, #(2*(SUM_STRIDE)) + mov x9, #(2*SUM_STRIDE) + mov x10, #FILTER_OUT_STRIDE + add x11, x5, #7 + bic x11, x11, #7 // Aligned width +.if \bpc == 8 + sub x2, x2, x11 +.else + sub x2, x2, x11, lsl #1 +.endif + sub x10, x10, x11 + sub x9, x9, x11 + sub x9, x9, #4 // We read 4 extra elements from a + sub x12, x9, #4 // We read 8 extra elements from b + mov x11, x5 + movi v4.8h, #5 + movi v5.4s, #5 + movi v6.8h, #6 + movi v7.4s, #6 +1: + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 + +2: + subs x5, x5, #8 + ext v24.16b, v0.16b, v1.16b, #4 // +1-stride + ext v25.16b, v2.16b, v3.16b, #4 // +1+stride + ext v22.16b, v0.16b, v1.16b, #2 // -stride + ext v23.16b, v2.16b, v3.16b, #2 // +stride + add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride + add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride + add v2.8h, v22.8h, v23.8h // -stride, +stride + add v0.8h, v0.8h, v25.8h + + ext v22.16b, v16.16b, v17.16b, #4 // -stride + ext v23.16b, v17.16b, v18.16b, #4 + ext v24.16b, v19.16b, v20.16b, #4 // +stride + ext v25.16b, v20.16b, v21.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1-stride + ext v27.16b, v17.16b, v18.16b, #8 + ext v28.16b, v19.16b, v20.16b, #8 // +1+stride + ext v29.16b, v20.16b, v21.16b, #8 + mul v0.8h, v0.8h, v4.8h // * 5 + mla v0.8h, v2.8h, v6.8h // * 6 +.if \bpc == 8 + ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif + add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v27.4s + add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride + add v20.4s, v20.4s, v29.4s + add v16.4s, v16.4s, v19.4s + add v17.4s, v17.4s, v20.4s + + add v22.4s, v22.4s, v24.4s // -stride, +stride + add v23.4s, v23.4s, v25.4s + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. 
+ mul v16.4s, v16.4s, v5.4s // * 5 + mla v16.4s, v22.4s, v7.4s // * 6 + mul v17.4s, v17.4s, v5.4s // * 5 + mla v17.4s, v23.4s, v7.4s // * 6 + +.if \bpc == 8 + uxtl v31.8h, v31.8b +.endif + umlal v16.4s, v0.4h, v31.4h // b + a * src + umlal2 v17.4s, v0.8h, v31.8h + mov v0.16b, v1.16b + rshrn v16.4h, v16.4s, #9 + rshrn2 v16.8h, v17.4s, #9 + mov v2.16b, v3.16b + st1 {v16.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + ld1 {v1.8h}, [x4], #16 + ld1 {v3.8h}, [x8], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + ld1 {v20.4s, v21.4s}, [x7], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + add x3, x3, x9, lsl #2 + add x7, x7, x9, lsl #2 + add x4, x4, x12, lsl #1 + add x8, x8, x12, lsl #1 + mov x13, x3 + mov x14, x4 + + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + +4: + subs x5, x5, #8 + ext v23.16b, v0.16b, v1.16b, #4 // +1 + ext v22.16b, v0.16b, v1.16b, #2 // 0 + add v0.8h, v0.8h, v23.8h // -1, +1 + + ext v24.16b, v16.16b, v17.16b, #4 // 0 + ext v25.16b, v17.16b, v18.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1 + ext v27.16b, v17.16b, v18.16b, #8 + mul v2.8h, v22.8h, v6.8h // * 6 + mla v2.8h, v0.8h, v4.8h // * 5 -> a +.if \bpc == 8 + ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif + add v16.4s, v16.4s, v26.4s // -1, +1 + add v17.4s, v17.4s, v27.4s +.if \bpc == 8 + uxtl v31.8h, v31.8b +.endif + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. 
+ mul v24.4s, v24.4s, v7.4s // * 6 + mla v24.4s, v16.4s, v5.4s // * 5 -> b + mul v25.4s, v25.4s, v7.4s // * 6 + mla v25.4s, v17.4s, v5.4s // * 5 -> b + + umlal v24.4s, v2.4h, v31.4h // b + a * src + umlal2 v25.4s, v2.8h, v31.8h + mov v0.16b, v1.16b + rshrn v24.4h, v24.4s, #8 + rshrn2 v24.8h, v25.4s, #8 + mov v16.16b, v18.16b + st1 {v24.8h}, [x0], #16 + + b.le 5f + ld1 {v1.8h}, [x4], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + b 4b + +5: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + mov x3, x13 // Rewind x3/x4 to where they started + mov x4, x14 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 +.if \bpc == 16 + ldr w8, [sp] +.endif + dup v31.8h, w7 + cmp x6, #2 +.if \bpc == 16 + dup v30.8h, w8 +.endif + add x9, x0, x1 + add x10, x2, x3 + add x11, x4, #2*FILTER_OUT_STRIDE + mov x7, #(4*FILTER_OUT_STRIDE) + lsl x1, x1, #1 + lsl x3, x3, #1 + add x8, x5, #7 + bic x8, x8, #7 // Aligned width +.if \bpc == 8 + sub x1, x1, x8 + sub x3, x3, x8 +.else + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +.endif + sub x7, x7, x8, lsl #1 + mov x8, x5 + b.lt 2f +1: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 + ld1 {v4.8b}, [x10], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v4.8h}, [x10], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v5.8h}, [x11], #16 + subs x5, x5, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u + ushll v4.8h, v4.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v4.8h, v4.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v5.8h, v5.8h, v4.8h // t1 - u + ushll v2.4s, v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + ushll v6.4s, v4.4h, #7 // u << 7 + ushll2 v7.4s, v4.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v + smlal v6.4s, 
v5.4h, v31.4h // v + smlal2 v7.4s, v5.8h, v31.8h // v +.if \bpc == 8 + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + rshrn v6.4h, v6.4s, #11 + rshrn2 v6.8h, v7.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun v6.8b, v6.8h + st1 {v2.8b}, [x0], #8 + st1 {v6.8b}, [x9], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v6.4h, v6.4s, #11 + sqrshrun2 v6.8h, v7.4s, #11 + umin v2.8h, v2.8h, v30.8h + umin v6.8h, v6.8h, v30.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x9], #16 +.endif + b.gt 1b + + sub x6, x6, #2 + cmp x6, #1 + b.lt 0f + mov x5, x8 + add x0, x0, x1 + add x9, x9, x1 + add x2, x2, x3 + add x10, x10, x3 + add x4, x4, x7 + add x11, x11, x7 + b.eq 2f + b 1b + +2: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif + ld1 {v1.8h}, [x4], #16 + subs x5, x5, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + ushll v2.4s, v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v +.if \bpc == 8 + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + sqxtun v2.8b, v2.8h + st1 {v2.8b}, [x0], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + umin v2.8h, v2.8h, v30.8h + st1 {v2.8h}, [x0], #16 +.endif + b.gt 2b +0: + ret +endfunc + +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2]); +function sgr_weighted2_\bpc\()bpc_neon, export=1 +.if \bpc == 8 + ldr x8, [sp] +.else + ldp x8, x9, [sp] +.endif + cmp x7, #2 + add x10, x0, x1 + add x11, x2, x3 + add x12, x4, #2*FILTER_OUT_STRIDE + add x13, x5, #2*FILTER_OUT_STRIDE + ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] +.if \bpc == 16 + dup v29.8h, w9 +.endif + mov x8, #4*FILTER_OUT_STRIDE + lsl x1, x1, #1 + lsl x3, x3, #1 + add x9, x6, #7 + bic x9, x9, 
#7 // Aligned width +.if \bpc == 8 + sub x1, x1, x9 + sub x3, x3, x9 +.else + sub x1, x1, x9, lsl #1 + sub x3, x3, x9, lsl #1 +.endif + sub x8, x8, x9, lsl #1 + mov x9, x6 + b.lt 2f +1: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 + ld1 {v16.8b}, [x11], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v16.8h}, [x11], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v17.8h}, [x12], #16 + ld1 {v2.8h}, [x5], #16 + ld1 {v18.8h}, [x13], #16 + subs x6, x6, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u + ushll v16.8h, v16.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v16.8h, v16.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + sub v17.8h, v17.8h, v16.8h // t1 - u + sub v18.8h, v18.8h, v16.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + ushll v19.4s, v16.4h, #7 // u << 7 + ushll2 v20.4s, v16.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) + smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u) + smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 + rshrn v3.4h, v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + rshrn v19.4h, v19.4s, #11 + rshrn2 v19.8h, v20.4s, #11 + sqxtun v3.8b, v3.8h + sqxtun v19.8b, v19.8h + st1 {v3.8b}, [x0], #8 + st1 {v19.8b}, [x10], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + sqrshrun v19.4h, v19.4s, #11 + sqrshrun2 v19.8h, v20.4s, #11 + umin v3.8h, v3.8h, v29.8h + umin v19.8h, v19.8h, v29.8h + st1 {v3.8h}, [x0], #16 + st1 {v19.8h}, [x10], #16 +.endif + b.gt 1b + + subs x7, x7, #2 + cmp x7, #1 + b.lt 0f + mov x6, x9 + add x0, x0, x1 + add x10, x10, x1 + add x2, x2, x3 + add x11, x11, x3 + add x4, x4, x8 + add x12, x12, x8 + add x5, x5, x8 + add x13, x13, x8 + b.eq 2f + b 1b + +2: +.if 
\bpc == 8 + ld1 {v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v2.8h}, [x5], #16 + subs x6, x6, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 + rshrn v3.4h, v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + sqxtun v3.8b, v3.8h + st1 {v3.8b}, [x0], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + umin v3.8h, v3.8h, v29.8h + st1 {v3.8h}, [x0], #16 +.endif + b.gt 2b // Stay in the single-row loop; 1b would re-enter the two-row loop and store to the row past h +0: + ret +endfunc +.endm diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S new file mode 100644 index 0000000000..32ed6e901a --- /dev/null +++ b/third_party/dav1d/src/arm/64/mc.S @@ -0,0 +1,3247 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro avg dst, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + add \t0\().8h, \t0\().8h, \t2\().8h + add \t1\().8h, \t1\().8h, \t3\().8h + sqrshrun \dst\().8b, \t0\().8h, #5 + sqrshrun2 \dst\().16b, \t1\().8h, #5 +.endm + +.macro w_avg dst, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sub \t0\().8h, \t2\().8h, \t0\().8h + sub \t1\().8h, \t3\().8h, \t1\().8h + sqdmulh \t0\().8h, \t0\().8h, v30.8h + sqdmulh \t1\().8h, \t1\().8h, v30.8h + add \t0\().8h, \t2\().8h, \t0\().8h + add \t1\().8h, \t3\().8h, \t1\().8h + sqrshrun \dst\().8b, \t0\().8h, #4 + sqrshrun2 \dst\().16b, \t1\().8h, #4 +.endm + +.macro mask dst, t0, t1, t2, t3 + ld1 {v30.16b}, [x6], 16 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + mul v30.16b, v30.16b, v31.16b + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + shll v28.8h, v30.8b, #8 + shll2 v29.8h, v30.16b, #8 + sub \t0\().8h, \t2\().8h, \t0\().8h + sub \t1\().8h, \t3\().8h, \t1\().8h + sqdmulh \t0\().8h, \t0\().8h, v28.8h + sqdmulh \t1\().8h, \t1\().8h, v29.8h + add \t0\().8h, \t2\().8h, \t0\().8h + add \t1\().8h, \t3\().8h, \t1\().8h + sqrshrun \dst\().8b, \t0\().8h, #4 + sqrshrun2 \dst\().16b, \t1\().8h, #4 +.endm + +.macro bidir_fn type +function \type\()_8bpc_neon, export=1 + clz w4, w4 +.ifc \type, w_avg + dup v30.8h, w6 + neg v30.8h, v30.8h + shl v30.8h, v30.8h, #11 +.endif +.ifc \type, 
mask + movi v31.16b, #256-2 +.endif + adr x7, L(\type\()_tbl) + sub w4, w4, #24 + ldrh w4, [x7, x4, lsl #1] + \type v4, v0, v1, v2, v3 + sub x7, x7, w4, uxtw + br x7 +40: + add x7, x0, x1 + lsl x1, x1, #1 +4: + cmp w5, #4 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x7], x1 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x7], x1 + b.eq 0f + \type v5, v0, v1, v2, v3 + cmp w5, #8 + st1 {v5.s}[0], [x0], x1 + st1 {v5.s}[1], [x7], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x7], x1 + b.eq 0f + \type v4, v0, v1, v2, v3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x7], x1 + \type v5, v0, v1, v2, v3 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x7], x1 + st1 {v5.s}[0], [x0], x1 + st1 {v5.s}[1], [x7], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x7], x1 + ret +80: + add x7, x0, x1 + lsl x1, x1, #1 +8: + st1 {v4.d}[0], [x0], x1 + \type v5, v0, v1, v2, v3 + st1 {v4.d}[1], [x7], x1 + st1 {v5.d}[0], [x0], x1 + subs w5, w5, #4 + st1 {v5.d}[1], [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 8b +16: + \type v5, v0, v1, v2, v3 + st1 {v4.16b}, [x0], x1 + \type v6, v0, v1, v2, v3 + st1 {v5.16b}, [x0], x1 + \type v7, v0, v1, v2, v3 + st1 {v6.16b}, [x0], x1 + subs w5, w5, #4 + st1 {v7.16b}, [x0], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 16b +320: + add x7, x0, x1 + lsl x1, x1, #1 +32: + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + st1 {v4.16b,v5.16b}, [x0], x1 + \type v7, v0, v1, v2, v3 + subs w5, w5, #2 + st1 {v6.16b,v7.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 32b +640: + add x7, x0, x1 + lsl x1, x1, #1 +64: + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 + st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 + subs w5, w5, #2 + st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 64b +1280: + add x7, x0, #64 +128: + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, 
v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 + st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 128b +0: + ret +L(\type\()_tbl): + .hword L(\type\()_tbl) - 1280b + .hword L(\type\()_tbl) - 640b + .hword L(\type\()_tbl) - 320b + .hword L(\type\()_tbl) - 16b + .hword L(\type\()_tbl) - 80b + .hword L(\type\()_tbl) - 40b +endfunc +.endm + +bidir_fn avg +bidir_fn w_avg +bidir_fn mask + + +.macro w_mask_fn type +function w_mask_\type\()_8bpc_neon, export=1 + clz w8, w4 + adr x9, L(w_mask_\type\()_tbl) + sub w8, w8, #24 + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + mov w10, #6903 + dup v0.8h, w10 +.if \type == 444 + movi v1.16b, #64 +.elseif \type == 422 + dup v2.8b, w7 + movi v3.8b, #129 + sub v3.8b, v3.8b, v2.8b +.elseif \type == 420 + dup v2.8h, w7 + movi v3.8h, #1, lsl #8 + sub v3.8h, v3.8h, v2.8h +.endif + add x12, x0, x1 + lsl x1, x1, #1 + br x9 +4: + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) + ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) + subs w5, w5, #4 + sub v16.8h, v6.8h, v4.8h + sub v17.8h, v7.8h, v5.8h + sabd v18.8h, v4.8h, v6.8h + sabd v19.8h, v5.8h, v7.8h + uqsub v18.8h, v0.8h, v18.8h + uqsub v19.8h, v0.8h, v19.8h + ushr v18.8h, v18.8h, #8 + ushr v19.8h, v19.8h, #8 + shl v20.8h, v18.8h, #9 + shl v21.8h, v19.8h, #9 + sqdmulh v20.8h, v20.8h, v16.8h + sqdmulh v21.8h, v21.8h, v17.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v5.8h + sqrshrun v22.8b, v20.8h, #4 + sqrshrun v23.8b, v21.8h, #4 +.if \type == 444 + xtn v18.8b, v18.8h + xtn2 v18.16b, v19.8h + sub v18.16b, v1.16b, v18.16b + st1 {v18.16b}, [x6], #16 +.elseif \type == 422 + addp v18.8h, v18.8h, v19.8h + xtn v18.8b, v18.8h + uhsub v18.8b, v3.8b, v18.8b + st1 {v18.8b}, [x6], #8 +.elseif \type == 420 + trn1 v24.2d, v18.2d, v19.2d + trn2 v25.2d, v18.2d, v19.2d + add v24.8h, v24.8h, v25.8h + 
addp v18.8h, v24.8h, v24.8h + sub v18.4h, v3.4h, v18.4h + rshrn v18.8b, v18.8h, #2 + st1 {v18.s}[0], [x6], #4 +.endif + st1 {v22.s}[0], [x0], x1 + st1 {v22.s}[1], [x12], x1 + st1 {v23.s}[0], [x0], x1 + st1 {v23.s}[1], [x12], x1 + b.gt 4b + ret +8: + ld1 {v4.8h, v5.8h}, [x2], #32 + ld1 {v6.8h, v7.8h}, [x3], #32 + subs w5, w5, #2 + sub v16.8h, v6.8h, v4.8h + sub v17.8h, v7.8h, v5.8h + sabd v18.8h, v4.8h, v6.8h + sabd v19.8h, v5.8h, v7.8h + uqsub v18.8h, v0.8h, v18.8h + uqsub v19.8h, v0.8h, v19.8h + ushr v18.8h, v18.8h, #8 + ushr v19.8h, v19.8h, #8 + shl v20.8h, v18.8h, #9 + shl v21.8h, v19.8h, #9 + sqdmulh v20.8h, v20.8h, v16.8h + sqdmulh v21.8h, v21.8h, v17.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v5.8h + sqrshrun v22.8b, v20.8h, #4 + sqrshrun v23.8b, v21.8h, #4 +.if \type == 444 + xtn v18.8b, v18.8h + xtn2 v18.16b, v19.8h + sub v18.16b, v1.16b, v18.16b + st1 {v18.16b}, [x6], #16 +.elseif \type == 422 + addp v18.8h, v18.8h, v19.8h + xtn v18.8b, v18.8h + uhsub v18.8b, v3.8b, v18.8b + st1 {v18.8b}, [x6], #8 +.elseif \type == 420 + add v18.8h, v18.8h, v19.8h + addp v18.8h, v18.8h, v18.8h + sub v18.4h, v3.4h, v18.4h + rshrn v18.8b, v18.8h, #2 + st1 {v18.s}[0], [x6], #4 +.endif + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x12], x1 + b.gt 8b + ret +1280: +640: +320: +160: + mov w11, w4 + sub x1, x1, w4, uxtw +.if \type == 444 + add x10, x6, w4, uxtw +.elseif \type == 422 + add x10, x6, x11, lsr #1 +.endif + add x9, x3, w4, uxtw #1 + add x7, x2, w4, uxtw #1 +161: + mov w8, w4 +16: + ld1 {v4.8h, v5.8h}, [x2], #32 + ld1 {v6.8h, v7.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x9], #32 + subs w8, w8, #16 + sub v6.8h, v6.8h, v4.8h + sub v7.8h, v7.8h, v5.8h + sub v18.8h, v18.8h, v16.8h + sub v19.8h, v19.8h, v17.8h + abs v20.8h, v6.8h + abs v21.8h, v7.8h + abs v22.8h, v18.8h + abs v23.8h, v19.8h + uqsub v20.8h, v0.8h, v20.8h + uqsub v21.8h, v0.8h, v21.8h + uqsub v22.8h, v0.8h, v22.8h + uqsub v23.8h, v0.8h, v23.8h + ushr v20.8h, v20.8h, #8 + 
ushr v21.8h, v21.8h, #8 + ushr v22.8h, v22.8h, #8 + ushr v23.8h, v23.8h, #8 + shl v24.8h, v20.8h, #9 + shl v25.8h, v21.8h, #9 + shl v26.8h, v22.8h, #9 + shl v27.8h, v23.8h, #9 + sqdmulh v24.8h, v24.8h, v6.8h + sqdmulh v25.8h, v25.8h, v7.8h + sqdmulh v26.8h, v26.8h, v18.8h + sqdmulh v27.8h, v27.8h, v19.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v16.8h + add v27.8h, v27.8h, v17.8h + sqrshrun v24.8b, v24.8h, #4 + sqrshrun v25.8b, v25.8h, #4 + sqrshrun v26.8b, v26.8h, #4 + sqrshrun v27.8b, v27.8h, #4 +.if \type == 444 + xtn v20.8b, v20.8h + xtn2 v20.16b, v21.8h + xtn v21.8b, v22.8h + xtn2 v21.16b, v23.8h + sub v20.16b, v1.16b, v20.16b + sub v21.16b, v1.16b, v21.16b + st1 {v20.16b}, [x6], #16 + st1 {v21.16b}, [x10], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h + addp v21.8h, v22.8h, v23.8h + xtn v20.8b, v20.8h + xtn v21.8b, v21.8h + uhsub v20.8b, v3.8b, v20.8b + uhsub v21.8b, v3.8b, v21.8b + st1 {v20.8b}, [x6], #8 + st1 {v21.8b}, [x10], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v22.8h + add v21.8h, v21.8h, v23.8h + addp v20.8h, v20.8h, v21.8h + sub v20.8h, v3.8h, v20.8h + rshrn v20.8b, v20.8h, #2 + st1 {v20.8b}, [x6], #8 +.endif + st1 {v24.8b, v25.8b}, [x0], #16 + st1 {v26.8b, v27.8b}, [x12], #16 + b.gt 16b + subs w5, w5, #2 + add x2, x2, w4, uxtw #1 + add x3, x3, w4, uxtw #1 + add x7, x7, w4, uxtw #1 + add x9, x9, w4, uxtw #1 +.if \type == 444 + add x6, x6, w4, uxtw + add x10, x10, w4, uxtw +.elseif \type == 422 + add x6, x6, x11, lsr #1 + add x10, x10, x11, lsr #1 +.endif + add x0, x0, x1 + add x12, x12, x1 + b.gt 161b + ret +L(w_mask_\type\()_tbl): + .hword L(w_mask_\type\()_tbl) - 1280b + .hword L(w_mask_\type\()_tbl) - 640b + .hword L(w_mask_\type\()_tbl) - 320b + .hword L(w_mask_\type\()_tbl) - 160b + .hword L(w_mask_\type\()_tbl) - 8b + .hword L(w_mask_\type\()_tbl) - 4b +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_8bpc_neon, export=1 + adr x6, L(blend_tbl) + clz w3, w3 + 
sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + br x6 +4: + ld1 {v2.8b}, [x5], #8 + ld1 {v1.d}[0], [x2], #8 + ld1 {v0.s}[0], [x0] + subs w4, w4, #2 + ld1 {v0.s}[1], [x8] + sub v3.8b, v4.8b, v2.8b + umull v5.8h, v1.8b, v2.8b + umlal v5.8h, v0.8b, v3.8b + rshrn v6.8b, v5.8h, #6 + st1 {v6.s}[0], [x0], x1 + st1 {v6.s}[1], [x8], x1 + b.gt 4b + ret +8: + ld1 {v2.16b}, [x5], #16 + ld1 {v1.16b}, [x2], #16 + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + sub v3.16b, v4.16b, v2.16b + subs w4, w4, #2 + umull v5.8h, v1.8b, v2.8b + umlal v5.8h, v0.8b, v3.8b + umull2 v6.8h, v1.16b, v2.16b + umlal2 v6.8h, v0.16b, v3.16b + rshrn v7.8b, v5.8h, #6 + rshrn2 v7.16b, v6.8h, #6 + st1 {v7.d}[0], [x0], x1 + st1 {v7.d}[1], [x8], x1 + b.gt 8b + ret +16: + ld1 {v1.16b, v2.16b}, [x5], #32 + ld1 {v5.16b, v6.16b}, [x2], #32 + ld1 {v0.16b}, [x0] + subs w4, w4, #2 + sub v7.16b, v4.16b, v1.16b + sub v20.16b, v4.16b, v2.16b + ld1 {v3.16b}, [x8] + umull v16.8h, v5.8b, v1.8b + umlal v16.8h, v0.8b, v7.8b + umull2 v17.8h, v5.16b, v1.16b + umlal2 v17.8h, v0.16b, v7.16b + umull v21.8h, v6.8b, v2.8b + umlal v21.8h, v3.8b, v20.8b + umull2 v22.8h, v6.16b, v2.16b + umlal2 v22.8h, v3.16b, v20.16b + rshrn v18.8b, v16.8h, #6 + rshrn2 v18.16b, v17.8h, #6 + rshrn v19.8b, v21.8h, #6 + rshrn2 v19.16b, v22.8h, #6 + st1 {v18.16b}, [x0], x1 + st1 {v19.16b}, [x8], x1 + b.gt 16b + ret +32: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 + ld1 {v20.16b, v21.16b}, [x0] + subs w4, w4, #2 + ld1 {v22.16b, v23.16b}, [x8] + sub v5.16b, v4.16b, v0.16b + sub v6.16b, v4.16b, v1.16b + sub v30.16b, v4.16b, v2.16b + sub v31.16b, v4.16b, v3.16b + umull v24.8h, v16.8b, v0.8b + umlal v24.8h, v20.8b, v5.8b + umull2 v26.8h, v16.16b, v0.16b + umlal2 v26.8h, v20.16b, v5.16b + umull v28.8h, v17.8b, v1.8b + umlal v28.8h, v21.8b, v6.8b + umull2 v7.8h, v17.16b, v1.16b + umlal2 v7.8h, v21.16b, v6.16b + umull v27.8h, 
v18.8b, v2.8b + umlal v27.8h, v22.8b, v30.8b + umull2 v1.8h, v18.16b, v2.16b + umlal2 v1.8h, v22.16b, v30.16b + umull v29.8h, v19.8b, v3.8b + umlal v29.8h, v23.8b, v31.8b + umull2 v21.8h, v19.16b, v3.16b + umlal2 v21.8h, v23.16b, v31.16b + rshrn v24.8b, v24.8h, #6 + rshrn2 v24.16b, v26.8h, #6 + rshrn v25.8b, v28.8h, #6 + rshrn2 v25.16b, v7.8h, #6 + rshrn v27.8b, v27.8h, #6 + rshrn2 v27.16b, v1.8h, #6 + rshrn v28.8b, v29.8h, #6 + rshrn2 v28.16b, v21.8h, #6 + st1 {v24.16b, v25.16b}, [x0], x1 + st1 {v27.16b, v28.16b}, [x8], x1 + b.gt 32b + ret +L(blend_tbl): + .hword L(blend_tbl) - 32b + .hword L(blend_tbl) - 16b + .hword L(blend_tbl) - 8b + .hword L(blend_tbl) - 4b +endfunc + +function blend_h_8bpc_neon, export=1 + adr x6, L(blend_h_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w4, uxtw + sub w4, w4, w4, lsr #2 + clz w7, w3 + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + sub w7, w7, #24 + ldrh w7, [x6, x7, lsl #1] + sub x6, x6, w7, uxtw + br x6 +2: + ld1 {v0.h}[0], [x5], #2 + ld1 {v1.s}[0], [x2], #4 + subs w4, w4, #2 + ld1 {v2.h}[0], [x0] + zip1 v0.8b, v0.8b, v0.8b + sub v3.8b, v4.8b, v0.8b + ld1 {v2.h}[1], [x8] + umull v5.8h, v1.8b, v0.8b + umlal v5.8h, v2.8b, v3.8b + rshrn v5.8b, v5.8h, #6 + st1 {v5.h}[0], [x0], x1 + st1 {v5.h}[1], [x8], x1 + b.gt 2b + ret +4: + ld2r {v0.8b, v1.8b}, [x5], #2 + ld1 {v2.8b}, [x2], #8 + subs w4, w4, #2 + ext v0.8b, v0.8b, v1.8b, #4 + ld1 {v3.s}[0], [x0] + sub v5.8b, v4.8b, v0.8b + ld1 {v3.s}[1], [x8] + umull v6.8h, v2.8b, v0.8b + umlal v6.8h, v3.8b, v5.8b + rshrn v6.8b, v6.8h, #6 + st1 {v6.s}[0], [x0], x1 + st1 {v6.s}[1], [x8], x1 + b.gt 4b + ret +8: + ld2r {v0.16b, v1.16b}, [x5], #2 + ld1 {v2.16b}, [x2], #16 + ld1 {v3.d}[0], [x0] + ext v0.16b, v0.16b, v1.16b, #8 + sub v5.16b, v4.16b, v0.16b + ld1 {v3.d}[1], [x8] + subs w4, w4, #2 + umull v6.8h, v0.8b, v2.8b + umlal v6.8h, v3.8b, v5.8b + umull2 v7.8h, v0.16b, v2.16b + umlal2 v7.8h, v3.16b, v5.16b + rshrn v16.8b, v6.8h, #6 + rshrn2 v16.16b, v7.8h, #6 + st1 {v16.d}[0], [x0], x1 + 
st1 {v16.d}[1], [x8], x1 + b.gt 8b + ret +16: + ld2r {v0.16b, v1.16b}, [x5], #2 + ld1 {v2.16b, v3.16b}, [x2], #32 + ld1 {v5.16b}, [x0] + sub v7.16b, v4.16b, v0.16b + sub v16.16b, v4.16b, v1.16b + ld1 {v6.16b}, [x8] + subs w4, w4, #2 + umull v17.8h, v0.8b, v2.8b + umlal v17.8h, v5.8b, v7.8b + umull2 v18.8h, v0.16b, v2.16b + umlal2 v18.8h, v5.16b, v7.16b + umull v19.8h, v1.8b, v3.8b + umlal v19.8h, v6.8b, v16.8b + umull2 v20.8h, v1.16b, v3.16b + umlal2 v20.8h, v6.16b, v16.16b + rshrn v21.8b, v17.8h, #6 + rshrn2 v21.16b, v18.8h, #6 + rshrn v22.8b, v19.8h, #6 + rshrn2 v22.16b, v20.8h, #6 + st1 {v21.16b}, [x0], x1 + st1 {v22.16b}, [x8], x1 + b.gt 16b + ret +1280: +640: +320: + sub x1, x1, w3, uxtw + add x7, x2, w3, uxtw +321: + ld2r {v0.16b, v1.16b}, [x5], #2 + mov w6, w3 + sub v20.16b, v4.16b, v0.16b + sub v21.16b, v4.16b, v1.16b +32: + ld1 {v16.16b, v17.16b}, [x2], #32 + ld1 {v2.16b, v3.16b}, [x0] + subs w6, w6, #32 + umull v23.8h, v0.8b, v16.8b + umlal v23.8h, v2.8b, v20.8b + ld1 {v18.16b, v19.16b}, [x7], #32 + umull2 v27.8h, v0.16b, v16.16b + umlal2 v27.8h, v2.16b, v20.16b + ld1 {v6.16b, v7.16b}, [x8] + umull v24.8h, v0.8b, v17.8b + umlal v24.8h, v3.8b, v20.8b + umull2 v28.8h, v0.16b, v17.16b + umlal2 v28.8h, v3.16b, v20.16b + umull v25.8h, v1.8b, v18.8b + umlal v25.8h, v6.8b, v21.8b + umull2 v5.8h, v1.16b, v18.16b + umlal2 v5.8h, v6.16b, v21.16b + rshrn v29.8b, v23.8h, #6 + rshrn2 v29.16b, v27.8h, #6 + umull v26.8h, v1.8b, v19.8b + umlal v26.8h, v7.8b, v21.8b + umull2 v31.8h, v1.16b, v19.16b + umlal2 v31.8h, v7.16b, v21.16b + rshrn v30.8b, v24.8h, #6 + rshrn2 v30.16b, v28.8h, #6 + rshrn v23.8b, v25.8h, #6 + rshrn2 v23.16b, v5.8h, #6 + rshrn v24.8b, v26.8h, #6 + st1 {v29.16b, v30.16b}, [x0], #32 + rshrn2 v24.16b, v31.8h, #6 + st1 {v23.16b, v24.16b}, [x8], #32 + b.gt 32b + subs w4, w4, #2 + add x0, x0, x1 + add x8, x8, x1 + add x2, x2, w3, uxtw + add x7, x7, w3, uxtw + b.gt 321b + ret +L(blend_h_tbl): + .hword L(blend_h_tbl) - 1280b + .hword L(blend_h_tbl) - 640b + 
.hword L(blend_h_tbl) - 320b + .hword L(blend_h_tbl) - 16b + .hword L(blend_h_tbl) - 8b + .hword L(blend_h_tbl) - 4b + .hword L(blend_h_tbl) - 2b +endfunc + +function blend_v_8bpc_neon, export=1 + adr x6, L(blend_v_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w3, uxtw + clz w3, w3 + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + br x6 +20: + ld1r {v0.8b}, [x5] + sub v1.8b, v4.8b, v0.8b +2: + ld1 {v2.h}[0], [x2], #2 + ld1 {v3.b}[0], [x0] + subs w4, w4, #2 + ld1 {v2.b}[1], [x2] + ld1 {v3.b}[1], [x8] + umull v5.8h, v2.8b, v0.8b + umlal v5.8h, v3.8b, v1.8b + rshrn v5.8b, v5.8h, #6 + add x2, x2, #2 + st1 {v5.b}[0], [x0], x1 + st1 {v5.b}[1], [x8], x1 + b.gt 2b + ret +40: + ld1r {v0.2s}, [x5] + sub x1, x1, #2 + sub v1.8b, v4.8b, v0.8b +4: + ld1 {v2.8b}, [x2], #8 + ld1 {v3.s}[0], [x0] + ld1 {v3.s}[1], [x8] + subs w4, w4, #2 + umull v5.8h, v2.8b, v0.8b + umlal v5.8h, v3.8b, v1.8b + rshrn v5.8b, v5.8h, #6 + st1 {v5.h}[0], [x0], #2 + st1 {v5.h}[2], [x8], #2 + st1 {v5.b}[2], [x0], x1 + st1 {v5.b}[6], [x8], x1 + b.gt 4b + ret +80: + ld1r {v0.2d}, [x5] + sub x1, x1, #4 + sub v1.16b, v4.16b, v0.16b +8: + ld1 {v2.16b}, [x2], #16 + ld1 {v3.d}[0], [x0] + ld1 {v3.d}[1], [x8] + subs w4, w4, #2 + umull v5.8h, v0.8b, v2.8b + umlal v5.8h, v3.8b, v1.8b + umull2 v6.8h, v0.16b, v2.16b + umlal2 v6.8h, v3.16b, v1.16b + rshrn v7.8b, v5.8h, #6 + rshrn2 v7.16b, v6.8h, #6 + st1 {v7.s}[0], [x0], #4 + st1 {v7.s}[2], [x8], #4 + st1 {v7.h}[2], [x0], x1 + st1 {v7.h}[6], [x8], x1 + b.gt 8b + ret +160: + ld1 {v0.16b}, [x5] + sub x1, x1, #8 + sub v2.16b, v4.16b, v0.16b +16: + ld1 {v5.16b, v6.16b}, [x2], #32 + ld1 {v7.16b}, [x0] + subs w4, w4, #2 + ld1 {v16.16b}, [x8] + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v7.8b, v2.8b + umull2 v18.8h, v5.16b, v0.16b + umlal2 v18.8h, v7.16b, v2.16b + umull v20.8h, v6.8b, v0.8b + umlal v20.8h, v16.8b, v2.8b + umull2 v21.8h, v6.16b, v0.16b + umlal2 v21.8h, v16.16b, v2.16b + rshrn v19.8b, 
v17.8h, #6 + rshrn2 v19.16b, v18.8h, #6 + rshrn v22.8b, v20.8h, #6 + rshrn2 v22.16b, v21.8h, #6 + st1 {v19.8b}, [x0], #8 + st1 {v22.8b}, [x8], #8 + st1 {v19.s}[2], [x0], x1 + st1 {v22.s}[2], [x8], x1 + b.gt 16b + ret +320: + ld1 {v0.16b, v1.16b}, [x5] + sub x1, x1, #16 + sub v2.16b, v4.16b, v0.16b + sub v3.8b, v4.8b, v1.8b +32: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 + ld1 {v5.16b, v6.16b}, [x0] + subs w4, w4, #2 + ld1 {v20.16b, v21.16b}, [x8] + umull v22.8h, v16.8b, v0.8b + umlal v22.8h, v5.8b, v2.8b + umull2 v23.8h, v16.16b, v0.16b + umlal2 v23.8h, v5.16b, v2.16b + umull v28.8h, v17.8b, v1.8b + umlal v28.8h, v6.8b, v3.8b + umull v30.8h, v18.8b, v0.8b + umlal v30.8h, v20.8b, v2.8b + umull2 v31.8h, v18.16b, v0.16b + umlal2 v31.8h, v20.16b, v2.16b + umull v25.8h, v19.8b, v1.8b + umlal v25.8h, v21.8b, v3.8b + rshrn v24.8b, v22.8h, #6 + rshrn2 v24.16b, v23.8h, #6 + rshrn v28.8b, v28.8h, #6 + rshrn v30.8b, v30.8h, #6 + rshrn2 v30.16b, v31.8h, #6 + rshrn v27.8b, v25.8h, #6 + st1 {v24.16b}, [x0], #16 + st1 {v30.16b}, [x8], #16 + st1 {v28.8b}, [x0], x1 + st1 {v27.8b}, [x8], x1 + b.gt 32b + ret +L(blend_v_tbl): + .hword L(blend_v_tbl) - 320b + .hword L(blend_v_tbl) - 160b + .hword L(blend_v_tbl) - 80b + .hword L(blend_v_tbl) - 40b + .hword L(blend_v_tbl) - 20b +endfunc + + +// This has got the same signature as the put_8tap functions, +// and assumes that x8 is set to (clz(w)-24). 
+function put_neon                       // unfiltered block copy: only loads and stores, dispatched on block width
+        adr x9, L(put_tbl)              // x9 = address of the offset table at the end of the function
+        ldrh w8, [x9, x8, lsl #1]       // w8 = 16-bit table entry number x8
+        sub x9, x9, w8, uxtw            // entries hold (table - label), so this yields the label address
+        br x9
+        // Register use below: x0/x1 = store pointer/stride, x2/x3 = load pointer/stride, w5 = rows left.
+2:                                      // 2 bytes per row, two rows per iteration
+        ld1 {v0.h}[0], [x2], x3
+        ld1 {v1.h}[0], [x2], x3
+        subs w5, w5, #2
+        st1 {v0.h}[0], [x0], x1
+        st1 {v1.h}[0], [x0], x1
+        b.gt 2b
+        ret
+4:                                      // 4 bytes per row, two rows per iteration
+        ld1 {v0.s}[0], [x2], x3
+        ld1 {v1.s}[0], [x2], x3
+        subs w5, w5, #2
+        st1 {v0.s}[0], [x0], x1
+        st1 {v1.s}[0], [x0], x1
+        b.gt 4b
+        ret
+8:                                      // 8 bytes per row, two rows per iteration
+        ld1 {v0.8b}, [x2], x3
+        ld1 {v1.8b}, [x2], x3
+        subs w5, w5, #2
+        st1 {v0.8b}, [x0], x1
+        st1 {v1.8b}, [x0], x1
+        b.gt 8b
+        ret
+160:                                    // 16 bytes per row: one-time setup of a second pointer pair
+        add x8, x0, x1                  // x8 = store pointer for the second row of each pair
+        lsl x1, x1, #1                  // store stride now covers two rows
+        add x9, x2, x3                  // x9 = load pointer for the second row of each pair
+        lsl x3, x3, #1                  // load stride now covers two rows
+16:                                     // two rows per iteration through the two pointer pairs
+        ld1 {v0.16b}, [x2], x3
+        ld1 {v1.16b}, [x9], x3
+        subs w5, w5, #2
+        st1 {v0.16b}, [x0], x1
+        st1 {v1.16b}, [x8], x1
+        b.gt 16b
+        ret
+32:                                     // 32 bytes per row, one row per iteration, via general-purpose register pairs
+        ldp x6, x7, [x2]
+        ldp x8, x9, [x2, #16]
+        stp x6, x7, [x0]
+        subs w5, w5, #1
+        stp x8, x9, [x0, #16]
+        add x2, x2, x3
+        add x0, x0, x1
+        b.gt 32b
+        ret
+64:                                     // 64 bytes per row, one row per iteration, via general-purpose register pairs
+        ldp x6, x7, [x2]
+        ldp x8, x9, [x2, #16]
+        stp x6, x7, [x0]
+        ldp x10, x11, [x2, #32]
+        stp x8, x9, [x0, #16]
+        subs w5, w5, #1
+        ldp x12, x13, [x2, #48]
+        stp x10, x11, [x0, #32]
+        stp x12, x13, [x0, #48]
+        add x2, x2, x3
+        add x0, x0, x1
+        b.gt 64b
+        ret
+128:                                    // 128 bytes per row, one row per iteration, via q0-q7
+        ldp q0, q1, [x2]
+        ldp q2, q3, [x2, #32]
+        stp q0, q1, [x0]
+        ldp q4, q5, [x2, #64]
+        stp q2, q3, [x0, #32]
+        ldp q6, q7, [x2, #96]
+        subs w5, w5, #1
+        stp q4, q5, [x0, #64]
+        stp q6, q7, [x0, #96]
+        add x2, x2, x3
+        add x0, x0, x1
+        b.gt 128b
+        ret
+        // Offset table, widest handler first; with x8 = clz(w)-24, w = 128 selects entry 0 and w = 2 selects entry 6.
+L(put_tbl):
+        .hword L(put_tbl) - 128b
+        .hword L(put_tbl) - 64b
+        .hword L(put_tbl) - 32b
+        .hword L(put_tbl) - 160b        // the 16-wide entry goes through the setup code at 160, not the loop label 16
+        .hword L(put_tbl) - 8b
+        .hword L(put_tbl) - 4b
+        .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
+function prep_neon                      // widen each source byte to 16 bit and shift it left by 4, dispatched on block width
+        adr x9, L(prep_tbl)             // x9 = address of the offset table at the end of the function
+        ldrh w8, [x9, x8, lsl #1]       // w8 = 16-bit table entry number x8
+        sub x9, x9, w8, uxtw            // entries hold (table - label), so this yields the label address
+        br x9
+        // Register use below: x0 = 16-bit output pointer, x1/x2 = load pointer/stride, w4 = rows left.
+4:                                      // 4 bytes per row, two rows per iteration
+        ld1 {v0.s}[0], [x1], x2
+        ld1 {v1.s}[0], [x1], x2
+        subs w4, w4, #2
+        ushll v0.8h, v0.8b, #4
+        ushll v1.8h, v1.8b, #4
+        st1 {v0.4h, v1.4h}, [x0], #16   // the two rows are stored back to back (2 x 8 bytes)
+        b.gt 4b
+        ret
+8:                                      // 8 bytes per row, two rows per iteration
+        ld1 {v0.8b}, [x1], x2
+        ld1 {v1.8b}, [x1], x2
+        subs w4, w4, #2
+        ushll v0.8h, v0.8b, #4
+        ushll v1.8h, v1.8b, #4
+        st1 {v0.8h, v1.8h}, [x0], #32   // the two rows are stored back to back (2 x 16 bytes)
+        b.gt 8b
+        ret
+160:                                    // 16 bytes per row: one-time setup of a second load pointer
+        add x9, x1, x2                  // x9 = load pointer for the second row of each pair
+        lsl x2, x2, #1                  // load stride now covers two rows
+16:                                     // two rows per iteration
+        ld1 {v0.16b}, [x1], x2
+        ld1 {v1.16b}, [x9], x2
+        subs w4, w4, #2
+        ushll v4.8h, v0.8b, #4          // ushll widens the low 8 bytes, ushll2 the high 8 bytes
+        ushll2 v5.8h, v0.16b, #4
+        ushll v6.8h, v1.8b, #4
+        ushll2 v7.8h, v1.16b, #4
+        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64    // both widened rows stored contiguously
+        b.gt 16b
+        ret
+320:                                    // 32 bytes per row
+        add x8, x0, w3, uxtw            // second store pointer = x0 + w3; NOTE(review): w3 presumably holds w (32 here), confirm against the caller
+32:                                     // two rows per iteration; x0 and x8 each store 32 bytes and then advance by x7
+        ld1 {v0.16b, v1.16b}, [x1], x2
+        subs w4, w4, #2
+        ushll v4.8h, v0.8b, #4
+        ushll2 v5.8h, v0.16b, #4
+        ld1 {v2.16b, v3.16b}, [x1], x2
+        ushll v6.8h, v1.8b, #4
+        ushll2 v7.8h, v1.16b, #4
+        ushll v16.8h, v2.8b, #4
+        st1 {v4.8h, v5.8h}, [x0], x7
+        ushll2 v17.8h, v2.16b, #4
+        st1 {v6.8h, v7.8h}, [x8], x7
+        ushll v18.8h, v3.8b, #4
+        st1 {v16.8h, v17.8h}, [x0], x7
+        ushll2 v19.8h, v3.16b, #4
+        st1 {v18.8h, v19.8h}, [x8], x7
+        b.gt 32b
+        ret
+640:                                    // 64 bytes per row
+        add x8, x0, #32                 // x8 trails x0 by one 32-byte store
+        mov x6, #64                     // each store pointer skips the 32 bytes written by the other one
+64:                                     // one row per iteration, written as four alternating 32-byte stores
+        ldp q0, q1, [x1]
+        subs w4, w4, #1
+        ushll v4.8h, v0.8b, #4
+        ushll2 v5.8h, v0.16b, #4
+        ldp q2, q3, [x1, #32]
+        ushll v6.8h, v1.8b, #4
+        ushll2 v7.8h, v1.16b, #4
+        add x1, x1, x2
+        ushll v16.8h, v2.8b, #4
+        st1 {v4.8h, v5.8h}, [x0], x6
+        ushll2 v17.8h, v2.16b, #4
+        ushll v18.8h, v3.8b, #4
+        st1 {v6.8h, v7.8h}, [x8], x6
+        ushll2 v19.8h, v3.16b, #4
+        st1 {v16.8h, v17.8h}, [x0], x6
+        st1 {v18.8h, v19.8h}, [x8], x6
+        b.gt 64b
+        ret
+1280:                                   // 128 bytes per row
+        add x8, x0, #64                 // x8 trails x0 by one 64-byte store
+        mov x6, #128                    // each store pointer skips the 64 bytes written by the other one
+128:                                    // one row per iteration, written as four alternating 64-byte stores
+        ldp q0, q1, [x1]
+        ldp q2, q3, [x1, #32]
+        ushll v16.8h, v0.8b, #4
+        ushll2 v17.8h, v0.16b, #4
+        ushll v18.8h, v1.8b, #4
+        ushll2 v19.8h, v1.16b, #4
+        ushll v20.8h, v2.8b, #4
+        ushll2 v21.8h, v2.16b, #4
+        ldp q4, q5, [x1, #64]
+        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
+        ushll v22.8h, v3.8b, #4
+        ushll2 v23.8h, v3.16b, #4
+        ushll v24.8h, v4.8b, #4
+        ushll2 v25.8h, v4.16b, #4
+        ushll v26.8h, v5.8b, #4
+        ushll2 v27.8h, v5.16b, #4
+        ldp q6, q7, [x1, #96]
+        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
+        ushll v28.8h, v6.8b, #4
+        ushll2 v29.8h, v6.16b, #4
+        ushll v30.8h, v7.8b, #4
+        ushll2 v31.8h, v7.16b, #4
+        subs w4, w4, #1
+        add x1, x1, x2
+        st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
+        st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
+        b.gt 128b
+        ret
+        // Offset table, widest handler first; with x8 = clz(w)-24, w = 128 selects entry 0 and w = 4 selects entry 5.
+L(prep_tbl):
+        .hword L(prep_tbl) - 1280b
+        .hword L(prep_tbl) - 640b
+        .hword L(prep_tbl) - 320b
+        .hword L(prep_tbl) - 160b
+        .hword L(prep_tbl) - 8b         // the two narrowest entries jump straight to loop labels, no setup needed
+        .hword L(prep_tbl) - 4b
+endfunc
+
+        // Helper macros used by the vertical 8tap paths later in the file: row loads, lane interleaving, widening, multiply-accumulate.
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6  // load one element of size wd into lane 0 of each given register
+        ld1 {\d0\wd}[0], [\s0], \strd   // registers alternate between pointers s0 and s1, each post-incremented by strd
+        ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2                               // d2 and d3 are loaded as a pair: d3 must be given whenever d2 is
+        ld1 {\d2\wd}[0], [\s0], \strd
+        ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4                               // d4, d5 and d6 are each optional on their own
+        ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+        ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+        ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6    // same pattern as load_slice, but loads whole registers of arrangement wd
+        ld1 {\d0\wd}, [\s0], \strd
+        ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2                               // d2 and d3 are loaded as a pair: d3 must be given whenever d2 is
+        ld1 {\d2\wd}, [\s0], \strd
+        ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+        ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+        ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6          // load_slice with 16-bit elements
+        load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6          // load_slice with 32-bit elements
+        load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6         // load_reg with 8-byte registers
+        load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6        // load_reg with 16-byte registers
+        load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4                      // merge each register with its successor, in place
+        trn1 \r0\wd, \r0\wd, \r1\wd     // trn1 keeps the even elements: lane 0 stays, lane 1 becomes lane 0 of the next register
+        trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3                               // r3 and r4 must be given together
+        trn1 \r2\wd, \r2\wd, \r3\wd
+        trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4                        // interleave_1 on 16-bit elements
+        interleave_1 .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4                        // interleave_1 on 32-bit elements
+        interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5                  // merge each register with the one two places further on, in place
+        trn1 \r0\wd, \r0\wd, \r2\wd
+        trn1 \r1\wd, \r1\wd, \r3\wd
+        trn1 \r2\wd, \r2\wd, \r4\wd
+        trn1 \r3\wd, \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5                    // interleave_2 on 32-bit elements
+        interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6                        // zero-extend the low 8 bytes of each given register to 8 halfwords, in place
+        uxtl \r0\().8h, \r0\().8b
+        uxtl \r1\().8h, \r1\().8b
+.ifnb \r2                               // r2 and r3 are handled as a pair: r3 must be given whenever r2 is
+        uxtl \r2\().8h, \r2\().8b
+        uxtl \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4
+        uxtl \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+        uxtl \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+        uxtl \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd                          // d = s0*v0.h[0] + s1*v0.h[1] + s2*v0.h[2] + s3*v0.h[3]
+        mul \d\wd, \s0\wd, v0.h[0]      // the four coefficients are taken from the low halfword lanes of v0
+        mla \d\wd, \s1\wd, v0.h[1]
+        mla \d\wd, \s2\wd, v0.h[2]
+        mla \d\wd, \s3\wd, v0.h[3]
+.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, thus keeping mul/mla tightly
+// chained like this.
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mul \d1\().8h, \s1\().8h, v0.h[0] + mla \d1\().8h, \s2\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[6] + mla \d1\().8h, \s8\().8h, v0.h[7] +.endm +.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mul \d1\().8h, \s2\().8h, v0.h[0] + mla \d1\().8h, \s3\().8h, v0.h[1] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d1\().8h, \s8\().8h, v0.h[6] + mla \d1\().8h, \s9\().8h, v0.h[7] +.endm +.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mul \d1\().8h, \s4\().8h, v0.h[0] + mla \d1\().8h, \s5\().8h, v0.h[1] + mla \d1\().8h, \s6\().8h, v0.h[2] + mla \d1\().8h, \s7\().8h, v0.h[3] + mla \d1\().8h, \s8\().8h, v0.h[4] + mla \d1\().8h, \s9\().8h, v0.h[5] + mla \d1\().8h, \s10\().8h, v0.h[6] + mla \d1\().8h, \s11\().8h, v0.h[7] +.endm +.macro sqrshrun_b shift, r0, r1, r2, r3 
+ sqrshrun \r0\().8b, \r0\().8h, #\shift +.ifnb \r1 + sqrshrun \r1\().8b, \r1\().8h, #\shift +.endif +.ifnb \r2 + sqrshrun \r2\().8b, \r2\().8h, #\shift + sqrshrun \r3\().8b, \r3\().8h, #\shift +.endif +.endm +.macro srshr_h shift, r0, r1, r2, r3 + srshr \r0\().8h, \r0\().8h, #\shift +.ifnb \r1 + srshr \r1\().8h, \r1\().8h, #\shift +.endif +.ifnb \r2 + srshr \r2\().8h, \r2\().8h, #\shift + srshr \r3\().8h, \r3\().8h, #\shift +.endif +.endm +.macro st_h strd, reg, lanes + st1 {\reg\().h}[0], [x0], \strd + st1 {\reg\().h}[1], [x8], \strd +.if \lanes > 2 + st1 {\reg\().h}[2], [x0], \strd + st1 {\reg\().h}[3], [x8], \strd +.endif +.endm +.macro st_s strd, r0, r1 + st1 {\r0\().s}[0], [x0], \strd + st1 {\r0\().s}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().s}[0], [x0], \strd + st1 {\r1\().s}[1], [x8], \strd +.endif +.endm +.macro st_d strd, r0, r1 + st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().d}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().d}[1], [x8], \strd +.endif +.endm +.macro shift_store_4 type, strd, r0, r1 +.ifc \type, put + sqrshrun_b 6, \r0, \r1 + st_s \strd, \r0, \r1 +.else + srshr_h 2, \r0, \r1 + st_d \strd, \r0, \r1 +.endif +.endm +.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 + st1 {\r0\wd}, [x0], \strd + st1 {\r1\wd}, [x8], \strd +.ifnb \r2 + st1 {\r2\wd}, [x0], \strd + st1 {\r3\wd}, [x8], \strd +.endif +.ifnb \r4 + st1 {\r4\wd}, [x0], \strd + st1 {\r5\wd}, [x8], \strd + st1 {\r6\wd}, [x0], \strd + st1 {\r7\wd}, [x8], \strd +.endif +.endm +.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro shift_store_8 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_b 6, \r0, \r1, \r2, \r3 + st_8b \strd, \r0, \r1, \r2, \r3 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st_16b \strd, \r0, \r1, \r2, \r3 +.endif +.endm +.macro shift_store_16 type, 
strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun \r0\().8b, \r0\().8h, #6 + sqrshrun2 \r0\().16b, \r1\().8h, #6 + sqrshrun \r2\().8b, \r2\().8h, #6 + sqrshrun2 \r2\().16b, \r3\().8h, #6 + st_16b \strd, \r0, \r2 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st1 {\r0\().8h, \r1\().8h}, [x0], \strd + st1 {\r2\().8h, \r3\().8h}, [x8], \strd +.endif +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_8bpc_neon, export=1 + mov x8, \type_h + mov x9, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, w10 + mul \my, \my, w10 + add \mx, \mx, w8 // mx, 8tap_h, 4tap_h + add \my, \my, w9 // my, 8tap_v, 4tap_v +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + tst \mx, #(0x7f << 14) + sub w8, w8, #24 + movrel x10, X(mc_subpel_filters), -8 + b.ne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + b.ne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w9 +4: + tst \my, #(0x7f << 14) + add \xmx, x10, \mx, uxtw #3 + b.ne L(\type\()_8tap_hv) + + adr x9, L(\type\()_8tap_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN h +.ifc 
\type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +2: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + uxtl v4.8h, v4.8b + uxtl v6.8h, v6.8b + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + subs \h, \h, #2 + trn1 v3.2s, v4.2s, v6.2s + trn2 v6.2s, v4.2s, v6.2s + trn1 v4.2s, v5.2s, v7.2s + trn2 v7.2s, v5.2s, v7.2s + mul v3.4h, v3.4h, v0.h[0] + mla v3.4h, v4.4h, v0.h[1] + mla v3.4h, v6.4h, v0.h[2] + mla v3.4h, v7.4h, v0.h[3] + srshr v3.4h, v3.4h, #2 + sqrshrun v3.8b, v3.8h, #4 + st1 {v3.h}[0], [\dst], \d_strd + st1 {v3.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +4: + ld1 {v16.8b}, [\src], \s_strd + ld1 {v20.8b}, [\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v20.8h, v20.8b + ext v17.16b, v16.16b, v16.16b, #2 + ext v18.16b, v16.16b, v16.16b, #4 + ext v19.16b, v16.16b, v16.16b, #6 + ext v21.16b, v20.16b, v20.16b, #2 + ext v22.16b, v20.16b, v20.16b, #4 + ext v23.16b, v20.16b, v20.16b, #6 + subs \h, \h, #2 + mul v16.4h, v16.4h, v0.h[0] + mla v16.4h, v17.4h, v0.h[1] + mla v16.4h, v18.4h, v0.h[2] + mla v16.4h, v19.4h, v0.h[3] + mul v20.4h, v20.4h, v0.h[0] + mla v20.4h, v21.4h, v0.h[1] + mla v20.4h, v22.4h, v0.h[2] + mla v20.4h, v23.4h, v0.h[3] + srshr v16.4h, v16.4h, #2 + srshr v20.4h, v20.4h, #2 +.ifc \type, put + sqrshrun v16.8b, v16.8h, #4 + sqrshrun v20.8b, v20.8h, #4 + st1 {v16.s}[0], [\dst], \d_strd + st1 {v20.s}[0], [\ds2], \d_strd +.else + st1 {v16.4h}, [\dst], \d_strd + st1 {v20.4h}, [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, 
\s_strd, #1 + sxtl v0.8h, v0.8b +8: + ld1 {v16.8b, v17.8b}, [\src], \s_strd + ld1 {v20.8b, v21.8b}, [\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + + mul v18.8h, v16.8h, v0.h[0] + mul v22.8h, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr + subs \h, \h, #2 + srshr v18.8h, v18.8h, #2 + srshr v22.8h, v22.8h, #2 +.ifc \type, put + sqrshrun v18.8b, v18.8h, #4 + sqrshrun v22.8b, v22.8h, #4 + st1 {v18.8b}, [\dst], \d_strd + st1 {v22.8b}, [\ds2], \d_strd +.else + st1 {v18.8h}, [\dst], \d_strd + st1 {v22.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 + ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 + mov \mx, \w + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + +16: + mul v24.8h, v16.8h, v0.h[0] + mul v25.8h, v17.8h, v0.h[0] + mul v26.8h, v20.8h, v0.h[0] + mul v27.8h, v21.8h, v0.h[0] +.irpc i, 1234567 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + srshr v26.8h, v26.8h, #2 + srshr v27.8h, v27.8h, #2 + subs \mx, \mx, #16 +.ifc \type, put + sqrshrun v24.8b, v24.8h, #4 + sqrshrun2 v24.16b, v25.8h, #4 + sqrshrun v26.8b, v26.8h, #4 + 
sqrshrun2 v26.16b, v27.8h, #4 + st1 {v24.16b}, [\dst], #16 + st1 {v26.16b}, [\ds2], #16 +.else + st1 {v24.8h, v25.8h}, [\dst], #32 + st1 {v26.8h, v27.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + ld1 {v17.8b, v18.8b}, [\src], #16 + ld1 {v21.8b, v22.8b}, [\sr2], #16 + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_8tap_h_tbl): + .hword L(\type\()_8tap_h_tbl) - 1280b + .hword L(\type\()_8tap_h_tbl) - 640b + .hword L(\type\()_8tap_h_tbl) - 320b + .hword L(\type\()_8tap_h_tbl) - 160b + .hword L(\type\()_8tap_h_tbl) - 80b + .hword L(\type\()_8tap_h_tbl) - 40b + .hword L(\type\()_8tap_h_tbl) - 20b + .hword 0 + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v +.ifc \type, put + b.gt 28f + + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + // 2x2 v + load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_h v1, v2, v3, v4, v5 + b.gt 24f + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .4h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 2 + ret + +24: // 2x4 v + load_h \sr2, \src, \s_strd, v6, v7 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 4 + ret + +28: // 2x8, 2x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + 
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 + interleave_1_h v1, v2, v3, v4, v5 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 +216: + subs \h, \h, #8 + load_h \sr2, \src, \s_strd, v16, v17, v18, v19 + load_h \sr2, \src, \s_strd, v20, v21, v22, v23 + interleave_1_h v7, v16, v17, v18, v19 + interleave_1_h v19, v20, v21, v22, v23 + interleave_2_s v5, v6, v7, v16, v17, v18 + interleave_2_s v17, v18, v19, v20, v21, v22 + uxtl_b v5, v6, v7, v16 + uxtl_b v17, v18, v19, v20 + mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20 + sqrshrun_b 6, v30, v31 + st_h \d_strd, v30, 4 + st_h \d_strd, v31, 4 + b.le 0f + mov v1.16b, v17.16b + mov v2.16b, v18.16b + mov v3.16b, v19.16b + mov v4.16b, v20.16b + mov v5.16b, v21.16b + mov v6.16b, v22.16b + mov v7.16b, v23.16b + b 216b +0: + ret +.endif + +40: + b.gt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_s v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + shift_store_4 \type, \d_strd, v6 + b.le 0f + load_s \sr2, \src, \s_strd, v6, v7 + interleave_1_s v5, v6, v7 + uxtl_b v5, v6 + mul_mla_4 v7, v3, v4, v5, v6, .8h + shift_store_4 \type, \d_strd, v7 +0: + ret + +480: // 4x8, 4x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + interleave_1_s v16, v17, v18 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v16, v17 + uxtl_b v18, v19, v20, v21 + +48: + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v23, v24, v25, v26 + interleave_1_s v22, v23, v24, v25, v26 + uxtl_b v22, v23, v24, 
v25 + mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + shift_store_4 \type, \d_strd, v1, v2 + b.le 0f + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v27, v16, v17, v18 + interleave_1_s v26, v27, v16, v17, v18 + uxtl_b v26, v27, v16, v17 + mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17 + shift_store_4 \type, \d_strd, v1, v2 + b.le 0f + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v19, v20, v21, v22 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v18, v19, v20, v21 + mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + shift_store_4 \type, \d_strd, v1, v2 + b.gt 48b +0: + ret + +80: + b.gt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4, v5 + mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4 v7, v2, v3, v4, v5, .8h + shift_store_8 \type, \d_strd, v6, v7 + b.le 0f + load_8b \sr2, \src, \s_strd, v6, v7 + uxtl_b v6, v7 + mul_mla_4 v1, v3, v4, v5, v6, .8h + mul_mla_4 v2, v4, v5, v6, v7, .8h + shift_store_8 \type, \d_strd, v1, v2 +0: + ret + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... 
+640: +1280: + ld1 {v0.8b}, [\xmy] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + uxtl_b v16, v17, v18, v19, v20, v21, v22 + +88: + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v23, v24 + uxtl_b v23, v24 + mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v25, v26 + uxtl_b v25, v26 + mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_8 \type, \d_strd, v3, v4 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v27, v16 + uxtl_b v27, v16 + mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v17, v18 + uxtl_b v17, v18 + mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + shift_store_8 \type, \d_strd, v3, v4 + b.le 9f + subs \h, \h, #4 + load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 + uxtl_b v19, v20, v21, v22 + mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.gt 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + ret + +160: + b.gt 1680b + + // 16x2, 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + cmp \h, #2 + load_16b \src, \sr2, 
\s_strd, v1, v2, v3, v4, v5 + uxtl v16.8h, v1.8b + uxtl v17.8h, v2.8b + uxtl v18.8h, v3.8b + uxtl v19.8h, v4.8b + uxtl v20.8h, v5.8b + uxtl2 v23.8h, v1.16b + uxtl2 v24.8h, v2.16b + uxtl2 v25.8h, v3.16b + uxtl2 v26.8h, v4.16b + uxtl2 v27.8h, v5.16b + mul_mla_4 v1, v16, v17, v18, v19, .8h + mul_mla_4 v16, v17, v18, v19, v20, .8h + mul_mla_4 v2, v23, v24, v25, v26, .8h + mul_mla_4 v17, v24, v25, v26, v27, .8h + shift_store_16 \type, \d_strd, v1, v2, v16, v17 + b.le 0f + load_16b \sr2, \src, \s_strd, v6, v7 + uxtl v21.8h, v6.8b + uxtl v22.8h, v7.8b + uxtl2 v28.8h, v6.16b + uxtl2 v29.8h, v7.16b + mul_mla_4 v1, v18, v19, v20, v21, .8h + mul_mla_4 v3, v19, v20, v21, v22, .8h + mul_mla_4 v2, v25, v26, v27, v28, .8h + mul_mla_4 v4, v26, v27, v28, v29, .8h + shift_store_16 \type, \d_strd, v1, v2, v3, v4 +0: + ret + +L(\type\()_8tap_v_tbl): + .hword L(\type\()_8tap_v_tbl) - 1280b + .hword L(\type\()_8tap_v_tbl) - 640b + .hword L(\type\()_8tap_v_tbl) - 320b + .hword L(\type\()_8tap_v_tbl) - 160b + .hword L(\type\()_8tap_v_tbl) - 80b + .hword L(\type\()_8tap_v_tbl) - 40b + .hword L(\type\()_8tap_v_tbl) - 20b + .hword 0 + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 280f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + + // 2x2, 2x4 hv + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addp v28.4h, v28.4h, v29.4h + addp v16.4h, v28.4h, v28.4h + srshr v16.4h, v16.4h, #2 + bl L(\type\()_8tap_filter_2) + + trn1 v16.2s, v16.2s, v28.2s + mov v17.8b, 
v28.8b + +2: + bl L(\type\()_8tap_filter_2) + + ext v18.8b, v17.8b, v28.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v28.8b + b 2b + +280: // 2x8, 2x16, 2x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addp v28.4h, v28.4h, v29.4h + addp v16.4h, v28.4h, v28.4h + srshr v16.4h, v16.4h, #2 + + bl L(\type\()_8tap_filter_2) + trn1 v16.2s, v16.2s, v28.2s + mov v17.8b, v28.8b + bl L(\type\()_8tap_filter_2) + ext v18.8b, v17.8b, v28.8b, #4 + mov v19.8b, v28.8b + bl L(\type\()_8tap_filter_2) + ext v20.8b, v19.8b, v28.8b, #4 + mov v21.8b, v28.8b + +28: + bl L(\type\()_8tap_filter_2) + ext v22.8b, v21.8b, v28.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v28.8b + b 28b + +0: + br x15 + +L(\type\()_8tap_filter_2): + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v30.8h, v30.8b + ext v29.16b, v28.16b, v28.16b, #2 + ext v31.16b, v30.16b, v30.16b, #2 + trn1 
v27.2s, v28.2s, v30.2s + trn2 v30.2s, v28.2s, v30.2s + trn1 v28.2s, v29.2s, v31.2s + trn2 v31.2s, v29.2s, v31.2s + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v30.4h, v0.h[2] + mla v27.4h, v31.4h, v0.h[3] + srshr v28.4h, v27.4h, #2 + ret +.endif + +40: + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 480f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + // 4x2, 4x4 hv + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + +4: + bl L(\type\()_8tap_filter_4) + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v28.4h, v1.h[2] + smlal v3.4s, v29.4h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v28.8b + mov v18.8b, v29.8b + b 4b + +480: // 4x8, 4x16, 4x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v19.8b, v28.8b + mov v20.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v21.8b, v28.8b + mov v22.8b, v29.8b + +48: + bl L(\type\()_8tap_filter_4) + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v19.4h, v1.h[2] + smlal v3.4s, v20.4h, v1.h[3] + smlal v3.4s, v21.4h, v1.h[4] + smlal v3.4s, v22.4h, v1.h[5] + smlal v3.4s, v28.4h, v1.h[6] + smlal v3.4s, v29.4h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn 
v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v28.8b + mov v22.8b, v29.8b + b 48b +0: + br x15 + +L(\type\()_8tap_filter_4): + ld1 {v26.8b}, [\sr2], \s_strd + ld1 {v27.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + uxtl v27.8h, v27.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + ext v28.16b, v27.16b, v27.16b, #2 + ext v29.16b, v27.16b, v27.16b, #4 + ext v30.16b, v27.16b, v27.16b, #6 + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v29.4h, v0.h[2] + mla v27.4h, v30.4h, v0.h[3] + srshr v28.4h, v31.4h, #2 + srshr v29.4h, v27.4h, #2 + ret + +80: +160: +320: + b.gt 880f + add \xmy, \xmy, #2 + ld1 {v0.8b}, [\xmx] + ld1 {v1.s}[0], [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v24.8h, v28.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] +.endr + srshr v16.8h, v24.8h, #2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + +8: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal 
v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v24.4h, v1.h[2] + smlal2 v5.4s, v24.8h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 {v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v24.16b + mov v18.16b, v25.16b + b 8b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + ld1 {v0.8b}, [\xmx] + ld1 {v1.8b}, [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v24.8h, v28.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] +.endr + srshr v16.8h, v24.8h, #2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v19.16b, v24.16b + mov v20.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v21.16b, v24.16b + mov v22.16b, v25.16b + +88: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v19.4h, v1.h[2] + smlal2 v5.4s, v19.8h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal v4.4s, v20.4h, v1.h[3] + smlal2 v5.4s, v20.8h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal2 v3.4s, v20.8h, v1.h[4] + smlal v4.4s, v21.4h, v1.h[4] + smlal2 v5.4s, v21.8h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal2 v3.4s, v21.8h, v1.h[5] + smlal v4.4s, v22.4h, v1.h[5] + smlal2 v5.4s, v22.8h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal2 v3.4s, v22.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] + smlal v2.4s, v24.4h, v1.h[7] + smlal2 v3.4s, v24.8h, v1.h[7] + smlal v4.4s, v25.4h, v1.h[7] + smlal2 v5.4s, v25.8h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, 
#2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 {v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v24.16b + mov v22.16b, v25.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + br x15 + +L(\type\()_8tap_filter_8): + ld1 {v28.8b, v29.8b}, [\sr2], \s_strd + ld1 {v30.8b, v31.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + uxtl v30.8h, v30.8b + uxtl v31.8h, v31.8b + mul v24.8h, v28.8h, v0.h[0] + mul v25.8h, v30.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + ret + +L(\type\()_8tap_hv_tbl): + .hword L(\type\()_8tap_hv_tbl) - 1280b + .hword L(\type\()_8tap_hv_tbl) - 640b + .hword L(\type\()_8tap_hv_tbl) - 320b + .hword L(\type\()_8tap_hv_tbl) - 160b + .hword L(\type\()_8tap_hv_tbl) - 80b + .hword L(\type\()_8tap_hv_tbl) - 40b + .hword L(\type\()_8tap_hv_tbl) - 20b + .hword 0 +endfunc + + +function \type\()_bilin_8bpc_neon, export=1 + dup v1.16b, \mx + dup v3.16b, \my + mov w9, #16 + sub w8, w9, \mx + sub w9, w9, \my + dup v0.16b, w8 + dup v2.16b, w9 +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + sub w8, w8, #24 + cbnz \mx, L(\type\()_bilin_h) + cbnz \my, L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cbnz \my, L(\type\()_bilin_hv) + + adr x9, L(\type\()_bilin_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, 
x9, w8, uxtw + br x9 + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + ld1 {v4.s}[0], [\src], \s_strd + ld1 {v6.s}[0], [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.4h, v4.4h, v6.4h + trn1 v5.4h, v5.4h, v7.4h + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.2s, v4.2s, v6.2s + trn1 v5.2s, v5.2s, v7.2s + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + ld1 {v4.16b}, [\src], \s_strd + ld1 {v6.16b}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #1 + ext v7.16b, v6.16b, v6.16b, #1 + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umull v6.8h, v6.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + umlal v6.8h, v7.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v6.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v6.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... 
h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.d}[1], [\src], #8 + ld1 {v20.d}[1], [\sr2], #8 + mov \mx, \w + +16: + ld1 {v18.16b}, [\src], #16 + ld1 {v22.16b}, [\sr2], #16 + ext v17.16b, v16.16b, v18.16b, #8 + ext v19.16b, v16.16b, v18.16b, #9 + ext v21.16b, v20.16b, v22.16b, #8 + ext v23.16b, v20.16b, v22.16b, #9 + umull v16.8h, v17.8b, v0.8b + umull2 v17.8h, v17.16b, v0.16b + umull v20.8h, v21.8b, v0.8b + umull2 v21.8h, v21.16b, v0.16b + umlal v16.8h, v19.8b, v1.8b + umlal2 v17.8h, v19.16b, v1.16b + umlal v20.8h, v23.8b, v1.8b + umlal2 v21.8h, v23.16b, v1.16b + subs \mx, \mx, #16 +.ifc \type, put + uqrshrn v16.8b, v16.8h, #4 + uqrshrn2 v16.16b, v17.8h, #4 + uqrshrn v20.8b, v20.8h, #4 + uqrshrn2 v20.16b, v21.8h, #4 + st1 {v16.16b}, [\dst], #16 + st1 {v20.16b}, [\ds2], #16 +.else + st1 {v16.8h, v17.8h}, [\dst], #32 + st1 {v20.8h, v21.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_bilin_h_tbl): + .hword L(\type\()_bilin_h_tbl) - 1280b + .hword L(\type\()_bilin_h_tbl) - 640b + .hword L(\type\()_bilin_h_tbl) - 320b + .hword L(\type\()_bilin_h_tbl) - 160b + .hword L(\type\()_bilin_h_tbl) - 80b + .hword L(\type\()_bilin_h_tbl) - 40b + .hword L(\type\()_bilin_h_tbl) - 20b + .hword 0 + + +L(\type\()_bilin_v): + cmp \h, #4 + adr x9, L(\type\()_bilin_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + ld1 {v16.h}[0], [\src], \s_strd + b.gt 24f + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], 
[\src], \s_strd + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst] + st1 {v4.h}[1], [\ds2] + ret +24: // 2x4, 2x8, ... v + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], [\src], \s_strd + ld1 {v19.h}[0], [\sr2], \s_strd + ld1 {v20.h}[0], [\src], \s_strd + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + trn1 v18.4h, v18.4h, v19.4h + trn1 v19.4h, v19.4h, v20.4h + trn1 v16.2s, v16.2s, v18.2s + trn1 v17.2s, v17.2s, v19.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + subs \h, \h, #4 + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + st1 {v4.h}[2], [\dst], \d_strd + st1 {v4.h}[3], [\ds2], \d_strd + b.le 0f + mov v16.8b, v20.8b + b 24b +0: + ret +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.s}[0], [\src], \s_strd +4: + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 4b +0: + ret + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.8b}, [\src], \s_strd +8: + ld1 {v17.8b}, [\sr2], \s_strd + ld1 {v18.8b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull v5.8h, v17.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + umlal v5.8h, v18.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v5.8b, v5.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], 
\d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 8b +0: + ret + +160: // 16xN, 32xN, ... +320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v16.16b}, [\src], \s_strd +2: + ld1 {v17.16b}, [\sr2], \s_strd + ld1 {v18.16b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull2 v5.8h, v16.16b, v2.16b + umull v6.8h, v17.8b, v2.8b + umull2 v7.8h, v17.16b, v2.16b + umlal v4.8h, v17.8b, v3.8b + umlal2 v5.8h, v17.16b, v3.16b + umlal v6.8h, v18.8b, v3.8b + umlal2 v7.8h, v18.16b, v3.16b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn2 v4.16b, v5.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + uqrshrn2 v6.16b, v7.8h, #4 + st1 {v4.16b}, [\dst], \d_strd + st1 {v6.16b}, [\ds2], \d_strd +.else + st1 {v4.8h, v5.8h}, [\dst], \d_strd + st1 {v6.8h, v7.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #16 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 +.ifc \type, put + add \dst, \dst, #16 +.else + add \dst, \dst, #32 +.endif + b 1b +0: + ret + +L(\type\()_bilin_v_tbl): + .hword L(\type\()_bilin_v_tbl) - 1280b + .hword L(\type\()_bilin_v_tbl) - 640b + .hword L(\type\()_bilin_v_tbl) - 320b + .hword L(\type\()_bilin_v_tbl) - 160b + .hword L(\type\()_bilin_v_tbl) - 80b + .hword L(\type\()_bilin_v_tbl) - 40b + .hword L(\type\()_bilin_v_tbl) - 20b + .hword 0 + +L(\type\()_bilin_hv): + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + adr x9, L(\type\()_bilin_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 
{v28.s}[0], [\sr2], \s_strd + ld1 {v30.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.4h, v28.4h, v30.4h + trn1 v29.4h, v29.4h, v31.4h + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2s, v16.2s, v17.2s + + mul v4.4h, v16.4h, v2.4h + mla v4.4h, v17.4h, v3.4h + uqrshrn v4.8b, v4.8h, #8 + subs \h, \h, #2 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2s, v17.2s, v17.2s + b 2b +0: + ret +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +4: + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.2s, v28.2s, v30.2s + trn1 v29.2s, v29.2s, v31.2s + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2d, v16.2d, v17.2d + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + trn2 v16.2d, v17.2d, v17.2d + b 4b +0: + ret + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 {v28.16b}, [\sr2], \s_strd + ld1 {v30.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + ext v31.16b, v30.16b, v30.16b, #1 + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + umull v18.8h, v30.8b, v0.8b + umlal v18.8h, v31.8b, v1.8b + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v18.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + uqrshrn v5.8b, v5.8h, #8 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 1b +0: + ret + +L(\type\()_bilin_hv_tbl): + .hword L(\type\()_bilin_hv_tbl) - 1280b + .hword L(\type\()_bilin_hv_tbl) - 640b + .hword L(\type\()_bilin_hv_tbl) - 320b + .hword L(\type\()_bilin_hv_tbl) - 160b + .hword L(\type\()_bilin_hv_tbl) - 80b + .hword L(\type\()_bilin_hv_tbl) - 40b + .hword L(\type\()_bilin_hv_tbl) - 20b + .hword 0 +endfunc +.endm + +/* Instantiate the macro once for put and once for prep. NOTE(review): the macro header is outside this view; the trailing 10 and 6 are presumably its shift_hv argument - confirm. */ +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + +/* load_filter_row dst, src, inc: read the table entry at x11 plus 8 times (src >> 10) into dst, then add inc to src. Every caller passes a d register as dst, so one 8-byte entry is read. Uses w13 as scratch and expects the table base in x11. */ +.macro load_filter_row dst, src, inc + asr w13, \src, #10 + ldr \dst, [x11, w13, sxtw #3] + add \src, \src, \inc +.endm + +/* warp_filter_horz_neon: horizontally filter one source row into 8 outputs. In: x2 = source row (advanced by x3 on return), w5 = filter position of the row (advanced by w8 on return), w7 = position step per output column, x11 = table base for load_filter_row. Out: v16.8h, where lane i is the sum of the 8 widened pixels starting at pixel i times the 8 taps of the i-th fetched table entry, rounded and shifted right by 3. Overwrites w12, w13, v0-v7 and v16-v23. Filter loads, widening and multiplies are interleaved rather than grouped. */ +function warp_filter_horz_neon + add w12, w5, #512 /* the 512 makes the asr #10 in load_filter_row round to nearest */ + + ld1 {v16.8b, v17.8b}, [x2], x3 /* 16 source bytes, then step x2 to the next row */ + + load_filter_row 
d0, w12, w7 + uxtl v16.8h, v16.8b + load_filter_row d1, w12, w7 + uxtl v17.8h, v17.8b + load_filter_row d2, w12, w7 + sxtl v0.8h, v0.8b + load_filter_row d3, w12, w7 + sxtl v1.8h, v1.8b + load_filter_row d4, w12, w7 + sxtl v2.8h, v2.8b + load_filter_row d5, w12, w7 + sxtl v3.8h, v3.8b + load_filter_row d6, w12, w7 + sxtl v4.8h, v4.8b + load_filter_row d7, w12, w7 + sxtl v5.8h, v5.8b + ext v18.16b, v16.16b, v17.16b, #2*1 /* pixels 1-8 for output 1 */ + mul v23.8h, v16.8h, v0.8h /* pixels 0-7 times the taps for output 0 */ + sxtl v6.8h, v6.8b + ext v19.16b, v16.16b, v17.16b, #2*2 + mul v18.8h, v18.8h, v1.8h + sxtl v7.8h, v7.8b + ext v20.16b, v16.16b, v17.16b, #2*3 + mul v19.8h, v19.8h, v2.8h + ext v21.16b, v16.16b, v17.16b, #2*4 + saddlp v23.4s, v23.8h /* pairwise add, widening the products to 32 bits */ + mul v20.8h, v20.8h, v3.8h + ext v22.16b, v16.16b, v17.16b, #2*5 + saddlp v18.4s, v18.8h + mul v21.8h, v21.8h, v4.8h + saddlp v19.4s, v19.8h + mul v22.8h, v22.8h, v5.8h + saddlp v20.4s, v20.8h + saddlp v21.4s, v21.8h + saddlp v22.4s, v22.8h + addp v18.4s, v23.4s, v18.4s + ext v23.16b, v16.16b, v17.16b, #2*6 + addp v19.4s, v19.4s, v20.4s + mul v23.8h, v23.8h, v6.8h + ext v20.16b, v16.16b, v17.16b, #2*7 + mul v20.8h, v20.8h, v7.8h + saddlp v23.4s, v23.8h + addp v21.4s, v21.4s, v22.4s + saddlp v20.4s, v20.8h + addp v20.4s, v23.4s, v20.4s + addp v18.4s, v18.4s, v19.4s /* v18 = sums for outputs 0-3 */ + addp v20.4s, v21.4s, v20.4s /* v20 = sums for outputs 4-7 */ + + add w5, w5, w8 /* filter position for the next row */ + + rshrn v16.4h, v18.4s, #3 + rshrn2 v16.8h, v20.4s, #3 + + ret +endfunc + +// void dav1d_warp_affine_8x8_8bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my) +/* warp t, shift: defines warp_affine_8x8_8bpc_neon (t blank) or warp_affine_8x8t_8bpc_neon (t = t). With t blank each row is narrowed with sqxtun and stored as 8 bytes; otherwise it is stored as 8 halfwords and x1 is doubled on entry. shift is the rounding right shift applied to the vertical sums. Arguments follow the prototype above, assuming the standard AArch64 argument registers: x0 dst, x1 dst_stride, x2 src, x3 src_stride, x4 abcd, w5 mx, w6 my. */ +.macro warp t, shift +function warp_affine_8x8\t\()_8bpc_neon, export=1 + ldr x4, [x4] /* all four 16-bit abcd values in one 64-bit load */ + sbfx x7, x4, #0, #16 + sbfx x8, x4, #16, #16 + sbfx x9, x4, #32, #16 + sbfx x4, x4, #48, #16 /* x7, x8, x9, x4 = sign-extended bits 0-15, 16-31, 32-47, 48-63 */ + mov w10, #8 /* output row counter */ + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + sub x2, x2, #3 /* src now starts 3 rows up and 3 bytes left */ + movrel x11, X(mc_warp_filter), 64*8 /* table base used by load_filter_row; NOTE(review): movrel is defined elsewhere, presumably symbol plus offset - confirm */ + mov x15, x30 /* bl below overwrites x30; return is br x15 */ +.ifnb \t + lsl x1, x1, #1 +.endif + + bl warp_filter_horz_neon /* prime v24-v30 with the first seven filtered rows */ + mov v24.16b, v16.16b + bl warp_filter_horz_neon + mov 
v25.16b, v16.16b + bl warp_filter_horz_neon + mov v26.16b, v16.16b + bl warp_filter_horz_neon + mov v27.16b, v16.16b + bl warp_filter_horz_neon + mov v28.16b, v16.16b + bl warp_filter_horz_neon + mov v29.16b, v16.16b + bl warp_filter_horz_neon + mov v30.16b, v16.16b + +1: /* one output row per pass */ + add w14, w6, #512 /* vertical filter position for column 0, same rounding bias */ + bl warp_filter_horz_neon + mov v31.16b, v16.16b /* eighth and newest row of the window */ + + /* d0-d7 = table entries for columns 0-7, the position stepping by w9 per column */ + load_filter_row d0, w14, w9 + load_filter_row d1, w14, w9 + load_filter_row d2, w14, w9 + load_filter_row d3, w14, w9 + load_filter_row d4, w14, w9 + load_filter_row d5, w14, w9 + load_filter_row d6, w14, w9 + load_filter_row d7, w14, w9 + /* NOTE(review): transpose_8x8b is defined outside this view; presumably it transposes the 8x8 bytes so that vk holds tap k for all 8 columns, with v16 and v17 as scratch - confirm */ + transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + sxtl v2.8h, v2.8b + sxtl v3.8h, v3.8b + sxtl v4.8h, v4.8b + sxtl v5.8h, v5.8b + sxtl v6.8h, v6.8b + sxtl v7.8h, v7.8b + + // This ordering of smull/smlal/smull2/smlal2 is highly + // beneficial for Cortex A53 here. + smull v16.4s, v24.4h, v0.4h /* v16 accumulates lanes 0-3 of rows v24-v31 times v0-v7 */ + smlal v16.4s, v25.4h, v1.4h + smlal v16.4s, v26.4h, v2.4h + smlal v16.4s, v27.4h, v3.4h + smlal v16.4s, v28.4h, v4.4h + smlal v16.4s, v29.4h, v5.4h + smlal v16.4s, v30.4h, v6.4h + smlal v16.4s, v31.4h, v7.4h + smull2 v17.4s, v24.8h, v0.8h /* v17 accumulates lanes 4-7 the same way */ + smlal2 v17.4s, v25.8h, v1.8h + smlal2 v17.4s, v26.8h, v2.8h + smlal2 v17.4s, v27.8h, v3.8h + smlal2 v17.4s, v28.8h, v4.8h + smlal2 v17.4s, v29.8h, v5.8h + smlal2 v17.4s, v30.8h, v6.8h + smlal2 v17.4s, v31.8h, v7.8h + + mov v24.16b, v25.16b /* slide the row window up by one, interleaved with the narrowing */ + mov v25.16b, v26.16b + sqrshrn v16.4h, v16.4s, #\shift + mov v26.16b, v27.16b + sqrshrn2 v16.8h, v17.4s, #\shift + mov v27.16b, v28.16b + mov v28.16b, v29.16b +.ifb \t + sqxtun v16.8b, v16.8h +.endif + mov v29.16b, v30.16b + mov v30.16b, v31.16b + subs w10, w10, #1 +.ifnb \t + st1 {v16.8h}, [x0], x1 +.else + st1 {v16.8b}, [x0], x1 +.endif + + add w6, w6, w4 /* add does not touch the flags, so b.gt still tests the subs on w10 */ + b.gt 1b + + br x15 /* return through the saved link register */ +endfunc +.endm + +/* Instantiate the blank variant with shift 11 and the t variant with shift 7. */ +warp , 11 +warp t, 7 + +// void dav1d_emu_edge_8bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, 
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride (last two arguments, read from the stack)
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3                // y >= ih ?
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2                // x >= iw ?
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13          // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1                // y + bh - ih < bh ?
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1                // max(-y, 0) < bh ?
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0                // x + bw - iw < bw ?
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0                // max(-x, 0) < bw ?
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+
+.macro v_loop need_left, need_right
+0:                                             // one iteration per center row (x1 = rows left)
+.if \need_left
+        ld1r            {v0.16b}, [x8]         // replicate the first byte of this ref row
+        mov             x12, x6                // out = dst
+        mov             x3,  x4                // remaining = left_ext
+1:                                             // left fill, 16 bytes per iteration
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16   // may overshoot left_ext; the center copy below rewrites those bytes
+        b.gt            1b
+.endif
+        mov             x13, x8                // in = ref
+        add             x12, x6,  x4           // out = dst + left_ext
+        mov             x3,  x2                // remaining = center_w
+1:                                             // center copy, 32 bytes per iteration (always runs at least once)
+        ld1             {v0.16b, v1.16b}, [x13], #32
+        subs            x3,  x3,  #32
+        st1             {v0.16b, v1.16b}, [x12], #32
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2           // in + center_w
+        sub             x3,  x3,  #1           // in + center_w - 1
+        add             x12, x6,  x4           // dst + left_ext
+        ld1r            {v0.16b}, [x3]         // replicate the last byte of the center span
+        add             x12, x12, x2           // out = dst + left_ext + center_w
+        mov             x3,  x11               // remaining = right_ext
+1:                                             // right fill, 16 bytes per iteration
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7           // dst += dst_stride
+        add             x8,  x8,  x9           // ref += ref_stride
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f                // left_ext == 0 ?
+        // need_left
+        cbz             x11, 3f                // right_ext == 0 ?
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f                // right_ext == 0 ?
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f                // bottom_ext == 0 ?
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride
+        mov             x4,  x0                // column counter = bw (left_ext is no longer needed)
+1:                                             // one 32 byte wide column strip per iteration
+        ld1             {v0.16b, v1.16b}, [x8], #32 // 32 bytes of the last center row already written to dst
+        mov             x3,  x10               // rows left = bottom_ext
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7 // replicate that row segment downwards
+        b.gt            2b
+        msub            x6,  x7,  x10, x6      // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f                // top_ext == 0 ?
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:                                             // one 32 byte wide column strip per iteration
+        ld1             {v0.16b, v1.16b}, [x14], #32 // 32 bytes of the first center row (x14 = backup of dst)
+        mov             x3,  x5                // rows left = top_ext
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7 // replicate that row segment into the rows above
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
new file mode 100644
index 0000000000..c00b709e68
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -0,0 +1,3575 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 8192 + +.macro avg d0, d1, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sqadd \t0\().8h, \t0\().8h, \t2\().8h + sqadd \t1\().8h, \t1\().8h, \t3\().8h + smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) + sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) +.endm + +.macro w_avg d0, d1, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + // This difference requires a 17 bit range, and all bits are + // significant for the following multiplication. + ssubl \d0\().4s, \t2\().4h, \t0\().4h + ssubl2 \t0\().4s, \t2\().8h, \t0\().8h + ssubl \d1\().4s, \t3\().4h, \t1\().4h + ssubl2 \t1\().4s, \t3\().8h, \t1\().8h + mul \d0\().4s, \d0\().4s, v27.4s + mul \t0\().4s, \t0\().4s, v27.4s + mul \d1\().4s, \d1\().4s, v27.4s + mul \t1\().4s, \t1\().4s, v27.4s + sshr \d0\().4s, \d0\().4s, #4 + sshr \t0\().4s, \t0\().4s, #4 + sshr \d1\().4s, \d1\().4s, #4 + sshr \t1\().4s, \t1\().4s, #4 + saddw \d0\().4s, \d0\().4s, \t2\().4h + saddw2 \t0\().4s, \t0\().4s, \t2\().8h + saddw \d1\().4s, \d1\().4s, \t3\().4h + saddw2 \t1\().4s, \t1\().4s, \t3\().8h + xtn \d0\().4h, \d0\().4s + xtn2 \d0\().8h, \t0\().4s + xtn \d1\().4h, \d1\().4s + xtn2 \d1\().8h, \t1\().4s + srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits + srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits + add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits + add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits + smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max + smin \d1\().8h, \d1\().8h, v31.8h // 
bitdepth_max + smax \d0\().8h, \d0\().8h, v30.8h // 0 + smax \d1\().8h, \d1\().8h, v30.8h // 0 +.endm + +.macro mask d0, d1, t0, t1, t2, t3 + ld1 {v27.16b}, [x6], 16 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + neg v27.16b, v27.16b + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sxtl v26.8h, v27.8b + sxtl2 v27.8h, v27.16b + sxtl v24.4s, v26.4h + sxtl2 v25.4s, v26.8h + sxtl v26.4s, v27.4h + sxtl2 v27.4s, v27.8h + ssubl \d0\().4s, \t2\().4h, \t0\().4h + ssubl2 \t0\().4s, \t2\().8h, \t0\().8h + ssubl \d1\().4s, \t3\().4h, \t1\().4h + ssubl2 \t1\().4s, \t3\().8h, \t1\().8h + mul \d0\().4s, \d0\().4s, v24.4s + mul \t0\().4s, \t0\().4s, v25.4s + mul \d1\().4s, \d1\().4s, v26.4s + mul \t1\().4s, \t1\().4s, v27.4s + sshr \d0\().4s, \d0\().4s, #6 + sshr \t0\().4s, \t0\().4s, #6 + sshr \d1\().4s, \d1\().4s, #6 + sshr \t1\().4s, \t1\().4s, #6 + saddw \d0\().4s, \d0\().4s, \t2\().4h + saddw2 \t0\().4s, \t0\().4s, \t2\().8h + saddw \d1\().4s, \d1\().4s, \t3\().4h + saddw2 \t1\().4s, \t1\().4s, \t3\().8h + xtn \d0\().4h, \d0\().4s + xtn2 \d0\().8h, \t0\().4s + xtn \d1\().4h, \d1\().4s + xtn2 \d1\().8h, \t1\().4s + srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits + srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits + add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits + add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits + smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max + smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max + smax \d0\().8h, \d0\().8h, v30.8h // 0 + smax \d1\().8h, \d1\().8h, v30.8h // 0 +.endm + +.macro bidir_fn type, bdmax +function \type\()_16bpc_neon, export=1 + clz w4, w4 +.ifnc \type, avg + dup v31.8h, \bdmax // bitdepth_max + movi v30.8h, #0 +.endif + clz w7, \bdmax + sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18 +.ifc \type, avg + mov w9, #1 + mov w8, #-2*PREP_BIAS + lsl w9, w9, w7 // 1 << intermediate_bits + add w7, w7, #1 + sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits + neg w7, w7 // 
-(intermediate_bits+1) + dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits + dup v29.8h, w7 // -(intermediate_bits+1) +.else + mov w8, #PREP_BIAS + lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits + neg w7, w7 // -intermediate_bits + dup v28.8h, w8 // PREP_BIAS >> intermediate_bits + dup v29.8h, w7 // -intermediate_bits +.endif +.ifc \type, w_avg + dup v27.4s, w6 + neg v27.4s, v27.4s +.endif + adr x7, L(\type\()_tbl) + sub w4, w4, #24 + \type v4, v5, v0, v1, v2, v3 + ldrh w4, [x7, x4, lsl #1] + sub x7, x7, w4, uxtw + br x7 +40: + add x7, x0, x1 + lsl x1, x1, #1 +4: + subs w5, w5, #4 + st1 {v4.d}[0], [x0], x1 + st1 {v4.d}[1], [x7], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v5.d}[1], [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 4b +80: + add x7, x0, x1 + lsl x1, x1, #1 +8: + st1 {v4.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v5.8h}, [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 8b +16: + \type v6, v7, v0, v1, v2, v3 + st1 {v4.8h, v5.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v6.8h, v7.8h}, [x0], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 16b +32: + \type v6, v7, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 32b +640: + add x7, x0, #64 +64: + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + \type v18, v19, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 64b +1280: + add x7, x0, #64 + mov x8, #128 + sub x1, x1, #128 +128: + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8 + \type v18, v19, v0, v1, v2, v3 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 + \type v4, v5, v0, v1, v2, v3 + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + \type v18, v19, v0, v1, v2, v3 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 + b.le 
0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               128b
+0:
+        ret
+L(\type\()_tbl):
+        .hword L(\type\()_tbl) - 1280b
+        .hword L(\type\()_tbl) - 640b
+        .hword L(\type\()_tbl) - 32b
+        .hword L(\type\()_tbl) - 16b
+        .hword L(\type\()_tbl) - 80b
+        .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+        ldr             w8,  [sp]              // bitdepth_max (read from the stack)
+        clz             w9,  w4                // clz(w)
+        adr             x10, L(w_mask_\type\()_tbl)
+        dup             v31.8h,  w8            // bitdepth_max
+        sub             w9,  w9,  #24          // jump table index = clz(w) - 24
+        clz             w8,  w8                // clz(bitdepth_max)
+        ldrh            w9,  [x10,  x9,  lsl #1]
+        sub             x10, x10, w9,  uxtw    // x10 = address of the width specific code path
+        sub             w8,  w8,  #12          // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+        mov             w9,  #PREP_BIAS*64
+        neg             w8,  w8                // -sh
+        mov             w11, #27615            // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
+        dup             v30.4s,  w9            // PREP_BIAS*64
+        dup             v29.4s,  w8            // -sh
+        dup             v0.8h,   w11           // 27615
+.if \type == 444
+        movi            v1.16b,  #64
+.elseif \type == 422
+        dup             v2.8b,   w7            // sign
+        movi            v3.8b,   #129
+        sub             v3.8b,   v3.8b,   v2.8b // 129 - sign
+.elseif \type == 420
+        dup             v2.8h,   w7            // sign
+        movi            v3.8h,   #1, lsl #8
+        sub             v3.8h,   v3.8h,   v2.8h // 256 - sign
+.endif
+        add             x12,  x0,  x1          // x12 = second output row
+        lsl             x1,   x1,  #1          // x0 and x12 each advance two rows per store
+        br              x10
+4:
+        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+        subs            w5,  w5,  #4
+        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v7.8h
+        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v17.4s,  v6.8h,   v4.8h
+        ssubl           v18.4s,  v7.4h,   v5.4h
+        ssubl2          v19.4s,  v7.8h,   v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
+        sshll           v6.4s,   v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v7.4s,   v7.4s,   v30.4s
+        uxtl            v22.4s,  v20.4h
+        uxtl2           v23.4s,  v20.8h
+        uxtl            v24.4s,  v21.4h
+        uxtl2           v25.4s,  v21.8h
+        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
+        mla             v5.4s,   v17.4s,  v23.4s
+        mla             v6.4s,   v18.4s,  v24.4s
+        mla             v7.4s,   v19.4s,  v25.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        srshl           v7.4s,   v7.4s,   v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v6.4s
+        sqxtun2         v5.8h,   v7.4s
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+.if \type == 444
+        xtn             v20.8b,  v20.8h           // 64 - m
+        xtn2            v20.16b, v21.8h
+        sub             v20.16b, v1.16b,  v20.16b // m
+        st1             {v20.16b}, [x6], #16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
+        xtn             v20.8b,  v20.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+        st1             {v20.8b}, [x6], #8
+.elseif \type == 420
+        trn1            v24.2d,  v20.2d,  v21.2d
+        trn2            v25.2d,  v20.2d,  v21.2d
+        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 
- my2) (row wise addition) + addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.s}[0], [x6], #4 +.endif + st1 {v4.d}[0], [x0], x1 + st1 {v4.d}[1], [x12], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v5.d}[1], [x12], x1 + b.gt 4b + ret +8: + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 + ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 + subs w5, w5, #2 + sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) + sabd v21.8h, v5.8h, v7.8h + ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v17.4s, v6.8h, v4.8h + ssubl v18.4s, v7.4h, v5.4h + ssubl2 v19.4s, v7.8h, v5.8h + uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() + uqsub v21.8h, v0.8h, v21.8h + sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 + sshll v6.4s, v5.4h, #6 + sshll2 v5.4s, v4.8h, #6 + sshll v4.4s, v4.4h, #6 + ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v21.8h, v21.8h, #10 + add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 + add v5.4s, v5.4s, v30.4s + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + uxtl v22.4s, v20.4h + uxtl2 v23.4s, v20.8h + uxtl v24.4s, v21.4h + uxtl2 v25.4s, v21.8h + mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) + mla v5.4s, v17.4s, v23.4s + mla v6.4s, v18.4s, v24.4s + mla v7.4s, v19.4s, v25.4s + srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v5.4s, v5.4s, v29.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + sqxtun v4.4h, v4.4s // iclip_pixel + sqxtun2 v4.8h, v5.4s + sqxtun v5.4h, v6.4s + sqxtun2 v5.8h, v7.4s + umin v4.8h, v4.8h, v31.8h // iclip_pixel + umin v5.8h, v5.8h, v31.8h +.if \type == 444 + xtn v20.8b, v20.8h // 64 - m + xtn2 v20.16b, v21.8h + sub v20.16b, v1.16b, v20.16b // m + st1 {v20.16b}, [x6], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) + xtn v20.8b, v20.8h + uhsub 
v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + st1 {v20.8b}, [x6], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) + addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.s}[0], [x6], #4 +.endif + st1 {v4.8h}, [x0], x1 + st1 {v5.8h}, [x12], x1 + b.gt 8b + ret +1280: +640: +320: +160: + mov w11, w4 + sub x1, x1, w4, uxtw #1 +.if \type == 444 + add x10, x6, w4, uxtw +.elseif \type == 422 + add x10, x6, x11, lsr #1 +.endif + add x9, x3, w4, uxtw #1 + add x7, x2, w4, uxtw #1 +161: + mov w8, w4 +16: + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 + ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 + ld1 {v6.8h, v7.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x9], #32 + subs w8, w8, #16 + sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) + sabd v21.8h, v5.8h, v17.8h + ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v23.4s, v16.8h, v4.8h + ssubl v24.4s, v17.4h, v5.4h + ssubl2 v25.4s, v17.8h, v5.8h + uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() + uqsub v21.8h, v0.8h, v21.8h + sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 + sshll v26.4s, v5.4h, #6 + sshll2 v5.4s, v4.8h, #6 + sshll v4.4s, v4.4h, #6 + ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v21.8h, v21.8h, #10 + add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 + add v5.4s, v5.4s, v30.4s + add v26.4s, v26.4s, v30.4s + add v27.4s, v27.4s, v30.4s + uxtl v16.4s, v20.4h + uxtl2 v17.4s, v20.8h + uxtl v28.4s, v21.4h + mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) + uxtl2 v16.4s, v21.8h + mla v5.4s, v23.4s, v17.4s + mla v26.4s, v24.4s, v28.4s + mla v27.4s, v25.4s, v16.4s + srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v5.4s, v5.4s, v29.4s + srshl v26.4s, v26.4s, v29.4s + srshl v27.4s, v27.4s, v29.4s + 
sqxtun v4.4h, v4.4s // iclip_pixel + sqxtun2 v4.8h, v5.4s + sqxtun v5.4h, v26.4s + sqxtun2 v5.8h, v27.4s + + // Start of other half + sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) + sabd v23.8h, v7.8h, v19.8h + + umin v4.8h, v4.8h, v31.8h // iclip_pixel + umin v5.8h, v5.8h, v31.8h + + ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v17.4s, v18.8h, v6.8h + ssubl v18.4s, v19.4h, v7.4h + ssubl2 v19.4s, v19.8h, v7.8h + uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() + uqsub v23.8h, v0.8h, v23.8h + sshll v24.4s, v6.4h, #6 // tmp1 << 6 + sshll2 v25.4s, v6.8h, #6 + sshll v26.4s, v7.4h, #6 + sshll2 v27.4s, v7.8h, #6 + ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v23.8h, v23.8h, #10 + add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 + add v25.4s, v25.4s, v30.4s + add v26.4s, v26.4s, v30.4s + add v27.4s, v27.4s, v30.4s + uxtl v6.4s, v22.4h + uxtl2 v7.4s, v22.8h + uxtl v28.4s, v23.4h + mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) + uxtl2 v6.4s, v23.8h + mla v25.4s, v17.4s, v7.4s + mla v26.4s, v18.4s, v28.4s + mla v27.4s, v19.4s, v6.4s + srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v25.4s, v25.4s, v29.4s + srshl v26.4s, v26.4s, v29.4s + srshl v27.4s, v27.4s, v29.4s + sqxtun v6.4h, v24.4s // iclip_pixel + sqxtun2 v6.8h, v25.4s + sqxtun v7.4h, v26.4s + sqxtun2 v7.8h, v27.4s + umin v6.8h, v6.8h, v31.8h // iclip_pixel + umin v7.8h, v7.8h, v31.8h +.if \type == 444 + xtn v20.8b, v20.8h // 64 - m + xtn2 v20.16b, v21.8h + xtn v21.8b, v22.8h + xtn2 v21.16b, v23.8h + sub v20.16b, v1.16b, v20.16b // m + sub v21.16b, v1.16b, v21.16b + st1 {v20.16b}, [x6], #16 + st1 {v21.16b}, [x10], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) + addp v21.8h, v22.8h, v23.8h + xtn v20.8b, v20.8h + xtn v21.8b, v21.8h + uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + uhsub v21.8b, v3.8b, v21.8b + st1 {v20.8b}, [x6], #8 + 
st1 {v21.8b}, [x10], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition) + add v21.8h, v21.8h, v23.8h + addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.8b}, [x6], #8 +.endif + st1 {v4.8h, v5.8h}, [x0], #32 + st1 {v6.8h, v7.8h}, [x12], #32 + b.gt 16b + subs w5, w5, #2 + add x2, x2, w4, uxtw #1 + add x3, x3, w4, uxtw #1 + add x7, x7, w4, uxtw #1 + add x9, x9, w4, uxtw #1 +.if \type == 444 + add x6, x6, w4, uxtw + add x10, x10, w4, uxtw +.elseif \type == 422 + add x6, x6, x11, lsr #1 + add x10, x10, x11, lsr #1 +.endif + add x0, x0, x1 + add x12, x12, x1 + b.gt 161b + ret +L(w_mask_\type\()_tbl): + .hword L(w_mask_\type\()_tbl) - 1280b + .hword L(w_mask_\type\()_tbl) - 640b + .hword L(w_mask_\type\()_tbl) - 320b + .hword L(w_mask_\type\()_tbl) - 160b + .hword L(w_mask_\type\()_tbl) - 8b + .hword L(w_mask_\type\()_tbl) - 4b +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_16bpc_neon, export=1 + adr x6, L(blend_tbl) + clz w3, w3 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + add x8, x0, x1 + br x6 +40: + lsl x1, x1, #1 +4: + ld1 {v2.8b}, [x5], #8 + ld1 {v1.8h}, [x2], #16 + ld1 {v0.d}[0], [x0] + neg v2.8b, v2.8b // -m + subs w4, w4, #2 + ld1 {v0.d}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x8], x1 + b.gt 4b + ret +80: + lsl x1, x1, #1 +8: + ld1 {v4.16b}, [x5], #16 + ld1 {v2.8h, v3.8h}, [x2], #32 + neg v5.16b, v4.16b // -m + ld1 {v0.8h}, [x0] + ld1 {v1.8h}, [x8] + sxtl v4.8h, v5.8b + sxtl2 v5.8h, v5.16b + shl v4.8h, v4.8h, #9 // -m << 9 + shl v5.8h, v5.8h, #9 + sub v2.8h, v0.8h, v2.8h // a - b + sub 
v3.8h, v1.8h, v3.8h + subs w4, w4, #2 + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v5.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 + b.gt 8b + ret +160: + lsl x1, x1, #1 +16: + ld1 {v16.16b, v17.16b}, [x5], #32 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + subs w4, w4, #2 + neg v18.16b, v16.16b // -m + neg v19.16b, v17.16b + ld1 {v0.8h, v1.8h}, [x0] + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + ld1 {v2.8h, v3.8h}, [x8] + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.8h, v17.8h, #9 + shl v18.8h, v18.8h, #9 + shl v19.8h, v19.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v17.8h + sqrdmulh v6.8h, v6.8h, v18.8h + sqrdmulh v7.8h, v7.8h, v19.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x8], x1 + b.gt 16b + ret +32: + ld1 {v16.16b, v17.16b}, [x5], #32 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + subs w4, w4, #1 + neg v18.16b, v16.16b // -m + neg v19.16b, v17.16b + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.8h, v17.8h, #9 + shl v18.8h, v18.8h, #9 + shl v19.8h, v19.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v17.8h + sqrdmulh v6.8h, v6.8h, v18.8h + sqrdmulh v7.8h, v7.8h, v19.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 32b + ret +L(blend_tbl): + .hword L(blend_tbl) - 32b 
+ .hword L(blend_tbl) - 160b + .hword L(blend_tbl) - 80b + .hword L(blend_tbl) - 40b +endfunc + +function blend_h_16bpc_neon, export=1 + adr x6, L(blend_h_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w4, uxtw + sub w4, w4, w4, lsr #2 + clz w7, w3 + add x8, x0, x1 + lsl x1, x1, #1 + sub w7, w7, #24 + ldrh w7, [x6, x7, lsl #1] + sub x6, x6, w7, uxtw + br x6 +2: + ld2r {v2.8b, v3.8b}, [x5], #2 + ld1 {v1.4h}, [x2], #8 + ext v2.8b, v2.8b, v3.8b, #6 + subs w4, w4, #2 + neg v2.8b, v2.8b // -m + ld1 {v0.s}[0], [x0] + ld1 {v0.s}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.4h, v2.4h, #9 // -m << 9 + sub v1.4h, v0.4h, v1.4h // a - b + sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 + add v0.4h, v0.4h, v1.4h + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x8], x1 + b.gt 2b + ret +4: + ld2r {v2.8b, v3.8b}, [x5], #2 + ld1 {v1.8h}, [x2], #16 + ext v2.8b, v2.8b, v3.8b, #4 + subs w4, w4, #2 + neg v2.8b, v2.8b // -m + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x8], x1 + b.gt 4b + ret +8: + ld2r {v4.8b, v5.8b}, [x5], #2 + ld1 {v2.8h, v3.8h}, [x2], #32 + neg v4.8b, v4.8b // -m + neg v5.8b, v5.8b + ld1 {v0.8h}, [x0] + subs w4, w4, #2 + sxtl v4.8h, v4.8b + sxtl v5.8h, v5.8b + ld1 {v1.8h}, [x8] + shl v4.8h, v4.8h, #9 // -m << 9 + shl v5.8h, v5.8h, #9 + sub v2.8h, v0.8h, v2.8h // a - b + sub v3.8h, v1.8h, v3.8h + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v5.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 + b.gt 8b + ret +16: + ld2r {v16.8b, v17.8b}, [x5], #2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + neg v16.8b, v16.8b // -m + neg v17.8b, v17.8b + ld1 {v0.8h, v1.8h}, [x0] + ld1 {v2.8h, v3.8h}, [x8] + subs w4, w4, #2 + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + shl v16.8h, v16.8h, #9 // -m 
<< 9 + shl v17.8h, v17.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v16.8h + sqrdmulh v6.8h, v6.8h, v17.8h + sqrdmulh v7.8h, v7.8h, v17.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x8], x1 + b.gt 16b + ret +1280: +640: +320: + sub x1, x1, w3, uxtw #1 + add x7, x2, w3, uxtw #1 +321: + ld2r {v24.8b, v25.8b}, [x5], #2 + mov w6, w3 + neg v24.8b, v24.8b // -m + neg v25.8b, v25.8b + sxtl v24.8h, v24.8b + sxtl v25.8h, v25.8b + shl v24.8h, v24.8h, #9 // -m << 9 + shl v25.8h, v25.8h, #9 +32: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w6, w6, #32 + sub v16.8h, v0.8h, v16.8h // a - b + sub v17.8h, v1.8h, v17.8h + sub v18.8h, v2.8h, v18.8h + sub v19.8h, v3.8h, v19.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] + sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v17.8h, v17.8h, v24.8h + sqrdmulh v18.8h, v18.8h, v24.8h + sqrdmulh v19.8h, v19.8h, v24.8h + sub v20.8h, v4.8h, v20.8h // a - b + sub v21.8h, v5.8h, v21.8h + sub v22.8h, v6.8h, v22.8h + sub v23.8h, v7.8h, v23.8h + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v21.8h, v21.8h, v25.8h + sqrdmulh v22.8h, v22.8h, v25.8h + sqrdmulh v23.8h, v23.8h, v25.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64 + b.gt 32b + subs w4, w4, #2 + add x0, x0, x1 + add x8, x8, x1 + add x2, x2, w3, uxtw #1 + add x7, x7, w3, uxtw #1 + b.gt 321b + ret +L(blend_h_tbl): + .hword 
L(blend_h_tbl) - 1280b + .hword L(blend_h_tbl) - 640b + .hword L(blend_h_tbl) - 320b + .hword L(blend_h_tbl) - 16b + .hword L(blend_h_tbl) - 8b + .hword L(blend_h_tbl) - 4b + .hword L(blend_h_tbl) - 2b +endfunc + +function blend_v_16bpc_neon, export=1 + adr x6, L(blend_v_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w3, uxtw + clz w3, w3 + add x8, x0, x1 + lsl x1, x1, #1 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + br x6 +20: + ld1r {v2.8b}, [x5] + neg v2.8b, v2.8b // -m + sxtl v2.8h, v2.8b + shl v2.4h, v2.4h, #9 // -m << 9 +2: + ld1 {v1.s}[0], [x2], #4 + ld1 {v0.h}[0], [x0] + subs w4, w4, #2 + ld1 {v1.h}[1], [x2] + ld1 {v0.h}[1], [x8] + add x2, x2, #4 + sub v1.4h, v0.4h, v1.4h // a - b + sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 + add v0.4h, v0.4h, v1.4h + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x8], x1 + b.gt 2b + ret +40: + ld1r {v2.2s}, [x5] + sub x1, x1, #4 + neg v2.8b, v2.8b // -m + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 +4: + ld1 {v1.8h}, [x2], #16 + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + subs w4, w4, #2 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.s}[0], [x0], #4 + st1 {v0.s}[2], [x8], #4 + st1 {v0.h}[2], [x0], x1 + st1 {v0.h}[6], [x8], x1 + b.gt 4b + ret +80: + ld1 {v4.8b}, [x5] + sub x1, x1, #8 + neg v4.8b, v4.8b // -m + sxtl v4.8h, v4.8b + shl v4.8h, v4.8h, #9 // -m << 9 +8: + ld1 {v2.8h, v3.8h}, [x2], #32 + ld1 {v0.8h}, [x0] + ld1 {v1.8h}, [x8] + subs w4, w4, #2 + sub v2.8h, v0.8h, v2.8h // a - b + sub v3.8h, v1.8h, v3.8h + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v4.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.d}[0], [x0], #8 + st1 {v1.d}[0], [x8], #8 + st1 {v0.s}[2], [x0], x1 + st1 {v1.s}[2], [x8], x1 + b.gt 8b + ret +160: + ld1 {v16.8b, v17.8b}, [x5] + sub x1, x1, #16 + neg v16.8b, v16.8b // -m + neg v17.8b, v17.8b + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b 
+ shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.4h, v17.4h, #9 +16: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + ld1 {v0.8h, v1.8h}, [x0] + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x8] + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.4h, v1.4h, v5.4h + sub v6.8h, v2.8h, v6.8h + sub v7.4h, v3.4h, v7.4h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.4h, v5.4h, v17.4h + sqrdmulh v6.8h, v6.8h, v16.8h + sqrdmulh v7.4h, v7.4h, v17.4h + add v0.8h, v0.8h, v4.8h + add v1.4h, v1.4h, v5.4h + add v2.8h, v2.8h, v6.8h + add v3.4h, v3.4h, v7.4h + st1 {v0.8h}, [x0], #16 + st1 {v2.8h}, [x8], #16 + st1 {v1.4h}, [x0], x1 + st1 {v3.4h}, [x8], x1 + b.gt 16b + ret +320: + ld1 {v24.16b, v25.16b}, [x5] + neg v26.16b, v24.16b // -m + neg v27.8b, v25.8b + sxtl v24.8h, v26.8b + sxtl2 v25.8h, v26.16b + sxtl v26.8h, v27.8b + shl v24.8h, v24.8h, #9 // -m << 9 + shl v25.8h, v25.8h, #9 + shl v26.8h, v26.8h, #9 +32: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h}, [x0] + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 + ld1 {v4.8h, v5.8h, v6.8h}, [x8] + subs w4, w4, #2 + sub v16.8h, v0.8h, v16.8h // a - b + sub v17.8h, v1.8h, v17.8h + sub v18.8h, v2.8h, v18.8h + sub v20.8h, v4.8h, v20.8h + sub v21.8h, v5.8h, v21.8h + sub v22.8h, v6.8h, v22.8h + sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v17.8h, v17.8h, v25.8h + sqrdmulh v18.8h, v18.8h, v26.8h + sqrdmulh v20.8h, v20.8h, v24.8h + sqrdmulh v21.8h, v21.8h, v25.8h + sqrdmulh v22.8h, v22.8h, v26.8h + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + st1 {v0.8h, v1.8h, v2.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 + b.gt 32b + ret +L(blend_v_tbl): + .hword L(blend_v_tbl) - 320b + .hword L(blend_v_tbl) - 160b + .hword L(blend_v_tbl) - 80b + .hword L(blend_v_tbl) - 40b + .hword L(blend_v_tbl) - 20b +endfunc + + +// This has got the same signature as the put_8tap 
functions,
+// and assumes that x9 is set to (clz(w)-24).
+function put_neon
+        adr             x10, L(put_tbl)
+        ldrh            w9, [x10, x9, lsl #1]       // x9 selects the entry: 0 for w == 128 ... 6 for w == 2
+        sub             x10, x10, w9, uxtw          // entries hold (table - target), hence the subtraction
+        br              x10
+// Every path below is a plain copy of w5 rows from x2 (stride x3) to x0 (stride x1); pixels are 2 bytes wide.
+2:                                                 // w == 2: 4 bytes per row, two rows per iteration
+        ld1             {v0.s}[0], [x2], x3
+        ld1             {v1.s}[0], [x2], x3
+        subs            w5, w5, #2
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        b.gt            2b
+        ret
+4:                                                 // w == 4: 8 bytes per row, two rows per iteration
+        ld1             {v0.4h}, [x2], x3
+        ld1             {v1.4h}, [x2], x3
+        subs            w5, w5, #2
+        st1             {v0.4h}, [x0], x1
+        st1             {v1.4h}, [x0], x1
+        b.gt            4b
+        ret
+80:                                                // w == 8: even rows via x0/x2, odd rows via x8/x9
+        add             x8, x0, x1
+        lsl             x1, x1, #1                  // strides doubled since each pointer skips a row
+        add             x9, x2, x3                  // x9 is free again, the table index was consumed above
+        lsl             x3, x3, #1
+8:
+        ld1             {v0.8h}, [x2], x3
+        ld1             {v1.8h}, [x9], x3
+        subs            w5, w5, #2
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
+        b.gt            8b
+        ret
+16:                                                // w == 16: 32 bytes per row, copied through x6-x9
+        ldp             x6, x7, [x2]
+        ldp             x8, x9, [x2, #16]
+        stp             x6, x7, [x0]
+        subs            w5, w5, #1
+        stp             x8, x9, [x0, #16]
+        add             x2, x2, x3
+        add             x0, x0, x1
+        b.gt            16b
+        ret
+32:                                                // w == 32: 64 bytes per row, copied through x6-x13
+        ldp             x6, x7, [x2]
+        ldp             x8, x9, [x2, #16]
+        stp             x6, x7, [x0]
+        ldp             x10, x11, [x2, #32]
+        stp             x8, x9, [x0, #16]
+        subs            w5, w5, #1
+        ldp             x12, x13, [x2, #48]
+        stp             x10, x11, [x0, #32]
+        stp             x12, x13, [x0, #48]
+        add             x2, x2, x3
+        add             x0, x0, x1
+        b.gt            32b
+        ret
+64:                                                // w == 64: 128 bytes per row, copied through q0-q7
+        ldp             q0, q1, [x2]
+        ldp             q2, q3, [x2, #32]
+        stp             q0, q1, [x0]
+        ldp             q4, q5, [x2, #64]
+        stp             q2, q3, [x0, #32]
+        ldp             q6, q7, [x2, #96]
+        subs            w5, w5, #1
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+        add             x2, x2, x3
+        add             x0, x0, x1
+        b.gt            64b
+        ret
+128:                                               // w == 128: 256 bytes per row, copied through q0-q7 and q16-q23
+        ldp             q0, q1, [x2]
+        ldp             q2, q3, [x2, #32]
+        stp             q0, q1, [x0]
+        ldp             q4, q5, [x2, #64]
+        stp             q2, q3, [x0, #32]
+        ldp             q6, q7, [x2, #96]
+        subs            w5, w5, #1
+        stp             q4, q5, [x0, #64]
+        ldp             q16, q17, [x2, #128]
+        stp             q6, q7, [x0, #96]
+        ldp             q18, q19, [x2, #160]
+        stp             q16, q17, [x0, #128]
+        ldp             q20, q21, [x2, #192]
+        stp             q18, q19, [x0, #160]
+        ldp             q22, q23, [x2, #224]
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x2, x2, x3
+        add             x0, x0, x1
+        b.gt            128b
+        ret
+
+L(put_tbl):                                        // indexed by clz(w)-24: w = 128, 64, 32, 16, 8, 4, 2
+        .hword L(put_tbl) - 128b
+        .hword L(put_tbl) - 64b
+        .hword L(put_tbl) - 32b
+        .hword L(put_tbl) - 16b
+        .hword L(put_tbl) - 80b
+        .hword L(put_tbl) - 4b
+        .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2.
+function prep_neon
+        adr             x10, L(prep_tbl)
+        ldrh            w9, [x10, x9, lsl #1]       // x9 selects the entry: 0 for w == 128 ... 5 for w == 4
+        dup             v31.8h, w7 // intermediate_bits
+        movi            v30.8h, #(PREP_BIAS >> 8), lsl #8   // PREP_BIAS with its low 8 bits cleared (movi takes an 8 bit immediate)
+        sub             x10, x10, w9, uxtw          // entries hold (table - target), hence the subtraction
+        br              x10
+// Every path below shifts the source by v31 (sshl), subtracts v30 and stores the rows back to back at x0; x1 = src, x2 = src stride, w4 = row count.
+40:                                                // w == 4: two rows share one register
+        add             x9, x1, x2                  // x9 = odd source rows
+        lsl             x2, x2, #1
+4:
+        ld1             {v0.d}[0], [x1], x2
+        ld1             {v0.d}[1], [x9], x2
+        subs            w4, w4, #2
+        sshl            v0.8h, v0.8h, v31.8h
+        sub             v0.8h, v0.8h, v30.8h
+        st1             {v0.8h}, [x0], #16
+        b.gt            4b
+        ret
+80:                                                // w == 8: two rows per iteration
+        add             x9, x1, x2                  // x9 = odd source rows
+        lsl             x2, x2, #1
+8:
+        ld1             {v0.8h}, [x1], x2
+        ld1             {v1.8h}, [x9], x2
+        subs            w4, w4, #2
+        sshl            v0.8h, v0.8h, v31.8h
+        sshl            v1.8h, v1.8h, v31.8h
+        sub             v0.8h, v0.8h, v30.8h
+        sub             v1.8h, v1.8h, v30.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            8b
+        ret
+16:                                                // w == 16: two rows per iteration
+        ldp             q0, q1, [x1]
+        add             x1, x1, x2
+        sshl            v0.8h, v0.8h, v31.8h
+        ldp             q2, q3, [x1]
+        add             x1, x1, x2
+        subs            w4, w4, #2
+        sshl            v1.8h, v1.8h, v31.8h
+        sshl            v2.8h, v2.8h, v31.8h
+        sshl            v3.8h, v3.8h, v31.8h
+        sub             v0.8h, v0.8h, v30.8h
+        sub             v1.8h, v1.8h, v30.8h
+        sub             v2.8h, v2.8h, v30.8h
+        sub             v3.8h, v3.8h, v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            16b
+        ret
+32:                                                // w == 32: one row per iteration
+        ldp             q0, q1, [x1]
+        sshl            v0.8h, v0.8h, v31.8h
+        ldp             q2, q3, [x1, #32]
+        add             x1, x1, x2
+        sshl            v1.8h, v1.8h, v31.8h
+        sshl            v2.8h, v2.8h, v31.8h
+        sshl            v3.8h, v3.8h, v31.8h
+        subs            w4, w4, #1
+        sub             v0.8h, v0.8h, v30.8h
+        sub             v1.8h, v1.8h, v30.8h
+        sub             v2.8h, v2.8h, v30.8h
+        sub             v3.8h, v3.8h, v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            32b
+        ret
+64:                                                // w == 64: one row per iteration, loads and shifts interleaved
+        ldp             q0, q1, [x1]
+        subs            w4, w4, #1
+        sshl            v0.8h, v0.8h, v31.8h
+        ldp             q2, q3, [x1, #32]
+        sshl            v1.8h, v1.8h, v31.8h
+        ldp             q4, q5, [x1, #64]
+        sshl            v2.8h, v2.8h, v31.8h
+        sshl            v3.8h, v3.8h, v31.8h
+        ldp             q6, q7, [x1, #96]
+        add             x1, x1, x2
+        sshl            v4.8h, v4.8h, v31.8h
+        sshl            v5.8h, v5.8h, v31.8h
+        sshl            v6.8h, v6.8h, v31.8h
+        sshl            v7.8h, v7.8h, v31.8h
+        sub             v0.8h, v0.8h, v30.8h
+        sub             v1.8h, v1.8h, v30.8h
+        sub             v2.8h, v2.8h, v30.8h
+        sub             v3.8h, v3.8h, v30.8h
+        stp             q0, q1, [x0]
+        sub             v4.8h, v4.8h, v30.8h
+        sub             v5.8h, v5.8h, v30.8h
+        stp             q2, q3, [x0, #32]
+        sub             v6.8h, v6.8h, v30.8h
+        sub             v7.8h, v7.8h, v30.8h
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+        add             x0, x0, x8                  // x8 = w*2 bytes, i.e. one output row
+        b.gt            64b
+        ret
+128:                                               // w == 128: one row per iteration, loads and shifts interleaved
+        ldp             q0, q1, [x1]
+        subs            w4, w4, #1
+        sshl            v0.8h, v0.8h, v31.8h
+        ldp             q2, q3, [x1, #32]
+        sshl            v1.8h, v1.8h, v31.8h
+        ldp             q4, q5, [x1, #64]
+        sshl            v2.8h, v2.8h, v31.8h
+        sshl            v3.8h, v3.8h, v31.8h
+        ldp             q6, q7, [x1, #96]
+        sshl            v4.8h, v4.8h, v31.8h
+        sshl            v5.8h, v5.8h, v31.8h
+        ldp             q16, q17, [x1, #128]
+        sshl            v6.8h, v6.8h, v31.8h
+        sshl            v7.8h, v7.8h, v31.8h
+        ldp             q18, q19, [x1, #160]
+        sshl            v16.8h, v16.8h, v31.8h
+        sshl            v17.8h, v17.8h, v31.8h
+        ldp             q20, q21, [x1, #192]
+        sshl            v18.8h, v18.8h, v31.8h
+        sshl            v19.8h, v19.8h, v31.8h
+        ldp             q22, q23, [x1, #224]
+        add             x1, x1, x2
+        sshl            v20.8h, v20.8h, v31.8h
+        sshl            v21.8h, v21.8h, v31.8h
+        sshl            v22.8h, v22.8h, v31.8h
+        sshl            v23.8h, v23.8h, v31.8h
+        sub             v0.8h, v0.8h, v30.8h
+        sub             v1.8h, v1.8h, v30.8h
+        sub             v2.8h, v2.8h, v30.8h
+        sub             v3.8h, v3.8h, v30.8h
+        stp             q0, q1, [x0]
+        sub             v4.8h, v4.8h, v30.8h
+        sub             v5.8h, v5.8h, v30.8h
+        stp             q2, q3, [x0, #32]
+        sub             v6.8h, v6.8h, v30.8h
+        sub             v7.8h, v7.8h, v30.8h
+        stp             q4, q5, [x0, #64]
+        sub             v16.8h, v16.8h, v30.8h
+        sub             v17.8h, v17.8h, v30.8h
+        stp             q6, q7, [x0, #96]
+        sub             v18.8h, v18.8h, v30.8h
+        sub             v19.8h, v19.8h, v30.8h
+        stp             q16, q17, [x0, #128]
+        sub             v20.8h, v20.8h, v30.8h
+        sub             v21.8h, v21.8h, v30.8h
+        stp             q18, q19, [x0, #160]
+        sub             v22.8h, v22.8h, v30.8h
+        sub             v23.8h, v23.8h, v30.8h
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x0, x0, x8                  // x8 = w*2 bytes, i.e. one output row
+        b.gt            128b
+        ret
+
+L(prep_tbl):                                       // indexed by clz(w)-24: w = 128, 64, 32, 16, 8, 4
+        .hword L(prep_tbl) - 128b
+        .hword L(prep_tbl) - 64b
+        .hword L(prep_tbl) - 32b
+        .hword L(prep_tbl) - 16b
+        .hword L(prep_tbl) - 80b
+        .hword L(prep_tbl) - 40b
+endfunc
+
+// Load helpers: rows are fetched alternately through the pointers s0 and s1, each post-incremented by strd; trailing destination arguments are optional. load_slice fills only lane 0 of each destination.
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}[0], [\s0], \strd
+        ld1             {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+        ld1
{\d2\wd}[0], [\s0], \strd
+        ld1             {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}, [\s0], \strd      // load_reg: whole register per row, rows alternate between s0 and s1
+        ld1             {\d1\wd}, [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}, [\s0], \strd      // d2 and d3 are only given together
+        ld1             {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
+        ld1             {\d0\wd, \d1\wd}, [\s0], \strd   // load_regpair: two registers per row
+.ifnb \d2
+        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6   // one 32 bit lane (2 pixels) per row
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6   // 4 pixels per row
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6   // 8 pixels per row
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
+        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5   // 16 pixels per row
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+        trn1            \r0\wd, \r0\wd, \r1\wd      // pair element 0 of each row with element 0 of the next row
+        trn1            \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+        trn1            \r2\wd, \r2\wd, \r3\wd
+        trn1            \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3
+        umin            \r0\wd, \r0\wd, \c\wd       // unsigned clamp of each given register to at most c
+.ifnb \r1
+        umin            \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+        umin            \r2\wd, \r2\wd, \c\wd       // r2 and r3 are only given together
+        umin            \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3
+        sub             \r0\wd, \r0\wd, \c\wd       // subtract c from each given register
+.ifnb \r1
+        sub             \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+        sub             \r2\wd, \r2\wd, \c\wd       // r2 and r3 are only given together
+        sub             \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3
+        smull           \d\().4s, \s0\().4h, v0.h[0]   // 4 tap sum over the low 4 lanes, coefficients in v0.h[0..3], 32 bit result
+        smlal           \d\().4s, \s1\().4h, v0.h[1]
+        smlal           \d\().4s, \s2\().4h, v0.h[2]
+        smlal           \d\().4s, \s3\().4h, v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3
+        smull2          \d\().4s, \s0\().8h, v0.h[0]   // same 4 tap sum over the high 4 lanes
+        smlal2          \d\().4s, \s1\().8h, v0.h[1]
+        smlal2          \d\().4s, \s2\().8h, v0.h[2]
+        smlal2          \d\().4s, \s3\().8h, v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull           \d\().4s, \s0\().4h, v0.h[0]   // 8 tap sum over the low 4 lanes, coefficients in v0.h[0..7]
+        smlal           \d\().4s, \s1\().4h, v0.h[1]
+        smlal           \d\().4s, \s2\().4h, v0.h[2]
+        smlal           \d\().4s, \s3\().4h, v0.h[3]
+        smlal           \d\().4s, \s4\().4h, v0.h[4]
+        smlal           \d\().4s, \s5\().4h, v0.h[5]
+        smlal           \d\().4s, \s6\().4h, v0.h[6]
+        smlal           \d\().4s, \s7\().4h, v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull2          \d\().4s, \s0\().8h, v0.h[0]   // same 8 tap sum over the high 4 lanes
+        smlal2          \d\().4s, \s1\().8h, v0.h[1]
+        smlal2          \d\().4s, \s2\().8h, v0.h[2]
+        smlal2          \d\().4s, \s3\().8h, v0.h[3]
+        smlal2          \d\().4s, \s4\().8h, v0.h[4]
+        smlal2          \d\().4s, \s5\().8h, v0.h[5]
+        smlal2          \d\().4s, \s6\().8h, v0.h[6]
+        smlal2          \d\().4s, \s7\().8h, v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3
+        sqrshrun        \r0\().4h, \r0\().4s, #\shift   // rounding shift right, saturated to unsigned 16 bit, in place
+.ifnb \r1
+        sqrshrun2       \r0\().8h, \r1\().4s, #\shift   // r1 lands in the upper half of r0
+.endif
+.ifnb \r2
+        sqrshrun        \r2\().4h, \r2\().4s, #\shift
+        sqrshrun2       \r2\().8h, \r3\().4s, #\shift   // r3 lands in the upper half of r2
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3
+        xtn             \r0\().4h, \r0\().4s        // truncating narrow; r1 is required and fills the upper half of r0
+        xtn2            \r0\().8h, \r1\().4s
+.ifnb \r2
+        xtn             \r2\().4h, \r2\().4s
+        xtn2            \r2\().8h, \r3\().4s
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3
+        srshl           \r0\().4s, \r0\().4s, \shift\().4s   // rounding shift by the signed per-lane amounts in the shift register
+        srshl           \r1\().4s, \r1\().4s, \shift\().4s
+.ifnb \r2
+        srshl           \r2\().4s, \r2\().4s, \shift\().4s
+        srshl           \r3\().4s, \r3\().4s, \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes
+        st1             {\reg\().s}[0], [x0], \strd   // even lanes go out through x0, odd lanes through x9
+        st1             {\reg\().s}[1], [x9], \strd   // NOTE(review): x9 is presumably the second-row dst pointer set up by the caller - confirm
+.if \lanes > 2
+        st1             {\reg\().s}[2], [x0], \strd
+        st1             {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+        st1             {\r0\().d}[0], [x0], \strd  // low half through x0, high half through x9
+        st1             {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6, \r0, \r1, \r2, \r3       // put: round off 6 bits, results packed into r0 and r2
+        umin_h          v31, .8h, \r0, \r2          // clamp against v31
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+        st_d            \strd, \r0, \r2             // each packed register holds two rows of 4 pixels
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+        st1             {\r0\wd}, [x0], \strd       // registers alternate between the x0 and x9 pointers
+        st1             {\r1\wd}, [x9], \strd
+.ifnb \r2
+        st1             {\r2\wd}, [x0], \strd
+        st1             {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+        st1             {\r4\wd}, [x0], \strd       // r4 to r7 are only given together
+        st1             {\r5\wd}, [x9], \strd
+        st1             {\r6\wd}, [x0], \strd
+        st1             {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
+        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6, \r0, \r1, \r2, \r3       // put: round off 6 bits, results packed into r0 and r2
+        umin_h          v31, .8h, \r0, \r2          // clamp against v31
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+        st_8h           \strd, \r0, \r2             // one 8 pixel row through x0, the next through x9
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6, \r0, \r1, \r2, \r3
+        umin            \r0\().8h, \r0\().8h, v31.8h
+        umin            \r1\().8h, \r2\().8h, v31.8h   // second half is moved from r2 into r1 for the paired store
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub             \r0\().8h, \r0\().8h, v29.8h
+        sub             \r1\().8h, \r2\().8h, v29.8h   // second half is moved from r2 into r1 for the paired store
+.endif
+        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd   // one 16 pixel row; a two register st1 list needs r0 and r1 to be consecutive
+.endm
+// make_8tap_fn emits one exported entry point per filter combination: it loads the horizontal and vertical filter type constants into w9 and w10 and branches to the shared op_8tap_neon body.
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+        mov             w9, \type_h
+        mov             w10, \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon +.ifc \bdmax, w8 + ldr w8, [sp] +.endif + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, w11 + mul \my, \my, w11 + add \mx, \mx, w9 // mx, 8tap_h, 4tap_h + add \my, \my, w10 // my, 8tap_v, 4tap_v +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + dup v31.8h, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz w9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + mov w12, #6 + tst \mx, #(0x7f << 14) + sub w9, w9, #24 + add w13, w12, \bdmax // 6 + intermediate_bits + sub w12, w12, \bdmax // 6 - intermediate_bits + movrel x11, X(mc_subpel_filters), -8 + b.ne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + b.ne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx w10, \mx, #7, #7 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w10 +4: + tst \my, #(0x7f << 14) + add \xmx, x11, \mx, uxtw #3 + b.ne L(\type\()_8tap_hv) + + adr x10, L(\type\()_8tap_h_tbl) + dup v30.4s, w12 // 6 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v30.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + dup v29.8h, \bdmax // intermediate_bits +.else + movi v28.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v29.8h, v29.8h // -intermediate_bits +.endif + br x10 + +20: // 2xN h +.ifc \type, put + add \xmx, \xmx, #2 
+ ld1 {v0.s}[0], [\xmx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +2: + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + subs \h, \h, #2 + trn1 v3.2s, v4.2s, v6.2s + trn2 v6.2s, v4.2s, v6.2s + trn1 v4.2s, v5.2s, v7.2s + trn2 v7.2s, v5.2s, v7.2s + smull v3.4s, v3.4h, v0.h[0] + smlal v3.4s, v4.4h, v0.h[1] + smlal v3.4s, v6.4h, v0.h[2] + smlal v3.4s, v7.4h, v0.h[3] + srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) + sqxtun v3.4h, v3.4s + srshl v3.4h, v3.4h, v29.4h // -intermediate_bits + umin v3.4h, v3.4h, v31.4h + st1 {v3.s}[0], [\dst], \d_strd + st1 {v3.s}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +4: + ld1 {v16.8h}, [\src], \s_strd + ld1 {v20.8h}, [\sr2], \s_strd + ext v17.16b, v16.16b, v16.16b, #2 + ext v18.16b, v16.16b, v16.16b, #4 + ext v19.16b, v16.16b, v16.16b, #6 + ext v21.16b, v20.16b, v20.16b, #2 + ext v22.16b, v20.16b, v20.16b, #4 + ext v23.16b, v20.16b, v20.16b, #6 + subs \h, \h, #2 + smull v16.4s, v16.4h, v0.h[0] + smlal v16.4s, v17.4h, v0.h[1] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[3] + smull v20.4s, v20.4h, v0.h[0] + smlal v20.4s, v21.4h, v0.h[1] + smlal v20.4s, v22.4h, v0.h[2] + smlal v20.4s, v23.4h, v0.h[3] + srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) + srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + sqxtun v16.4h, v16.4s + sqxtun2 v16.8h, v20.4s + srshl v16.8h, v16.8h, v29.8h // -intermediate_bits + umin v16.8h, v16.8h, v31.8h +.else + xtn v16.4h, v16.4s + xtn2 v16.8h, v20.4s + sub v16.8h, v16.8h, v28.8h // PREP_BIAS +.endif + st1 {v16.d}[0], [\dst], \d_strd + st1 {v16.d}[1], [\ds2], \d_strd + b.gt 
4b + ret + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + ld1 {v0.8b}, [\xmx] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + sub \s_strd, \s_strd, \w, uxtw #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw #1 +.endif +81: + ld1 {v16.8h, v17.8h}, [\src], #32 + ld1 {v20.8h, v21.8h}, [\sr2], #32 + mov \mx, \w + +8: + smull v18.4s, v16.4h, v0.h[0] + smull2 v19.4s, v16.8h, v0.h[0] + smull v22.4s, v20.4h, v0.h[0] + smull2 v23.4s, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v24.16b, v16.16b, v17.16b, #(2*\i) + ext v25.16b, v20.16b, v21.16b, #(2*\i) + smlal v18.4s, v24.4h, v0.h[\i] + smlal2 v19.4s, v24.8h, v0.h[\i] + smlal v22.4s, v25.4h, v0.h[\i] + smlal2 v23.4s, v25.8h, v0.h[\i] +.endr + subs \mx, \mx, #8 + srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) + srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) + srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) + srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + sqxtun v18.4h, v18.4s + sqxtun2 v18.8h, v19.4s + sqxtun v22.4h, v22.4s + sqxtun2 v22.8h, v23.4s + srshl v18.8h, v18.8h, v29.8h // -intermediate_bits + srshl v22.8h, v22.8h, v29.8h // -intermediate_bits + umin v18.8h, v18.8h, v31.8h + umin v22.8h, v22.8h, v31.8h +.else + xtn v18.4h, v18.4s + xtn2 v18.8h, v19.4s + xtn v22.4h, v22.4s + xtn2 v22.8h, v23.4s + sub v18.8h, v18.8h, v28.8h // PREP_BIAS + sub v22.8h, v22.8h, v28.8h // PREP_BIAS +.endif + st1 {v18.8h}, [\dst], #16 + st1 {v22.8h}, [\ds2], #16 + b.le 9f + + mov v16.16b, v17.16b + mov v20.16b, v21.16b + ld1 {v17.8h}, [\src], #16 + ld1 {v21.8h}, [\sr2], #16 + b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 81b + ret + +L(\type\()_8tap_h_tbl): + .hword L(\type\()_8tap_h_tbl) - 1280b + .hword L(\type\()_8tap_h_tbl) - 640b + .hword 
L(\type\()_8tap_h_tbl) - 320b + .hword L(\type\()_8tap_h_tbl) - 160b + .hword L(\type\()_8tap_h_tbl) - 80b + .hword L(\type\()_8tap_h_tbl) - 40b + .hword L(\type\()_8tap_h_tbl) - 20b + .hword 0 + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx w10, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w10 +4: + add \xmy, x11, \my, uxtw #3 + +.ifc \type, prep + dup v30.4s, w12 // 6 - intermediate_bits + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + adr x10, L(\type\()_8tap_v_tbl) + ldrh w9, [x10, x9, lsl #1] +.ifc \type, prep + neg v30.4s, v30.4s // -(6-intermediate_bits) +.endif + sub x10, x10, w9, uxtw + br x10 + +20: // 2xN v +.ifc \type, put + b.gt 28f + + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + // 2x2 v + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_s v1, v2, v3, v4, v5 + b.gt 24f + smull_smlal_4 v6, v1, v2, v3, v4 + sqrshrun_h 6, v6 + umin_h v31, .8h, v6 + st_s \d_strd, v6, 2 + ret + +24: // 2x4 v + load_s \sr2, \src, \s_strd, v6, v7 + interleave_1_s v5, v6, v7 + smull_smlal_4 v16, v1, v2, v3, v4 + smull_smlal_4 v17, v3, v4, v5, v6 + sqrshrun_h 6, v16, v17 + umin_h v31, .8h, v16 + st_s \d_strd, v16, 4 + ret + +28: // 2x8, 2x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 + interleave_1_s v1, v2, v3, v4, v5 + interleave_1_s v5, v6, v7 +216: + subs \h, \h, #8 + load_s \sr2, \src, \s_strd, v16, v17, v18, v19 + load_s \sr2, \src, \s_strd, v20, v21, v22, v23 + interleave_1_s v7, v16, v17, v18, v19 + interleave_1_s v19, v20, v21, v22, v23 + smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 + smull_smlal_8 v26, v5, v6, v7, v16, v17, 
v18, v19, v20 + smull_smlal_8 v27, v7, v16, v17, v18, v19, v20, v21, v22 + sqrshrun_h 6, v24, v25, v26, v27 + umin_h v31, .8h, v24, v26 + st_s \d_strd, v24, 4 + st_s \d_strd, v26, 4 + b.le 0f + mov v1.16b, v17.16b + mov v2.16b, v18.16b + mov v3.16b, v19.16b + mov v4.16b, v20.16b + mov v5.16b, v21.16b + mov v6.16b, v22.16b + mov v7.16b, v23.16b + b 216b +0: + ret +.endif + +40: + b.gt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + smull_smlal_4 v6, v1, v2, v3, v4 + smull_smlal_4 v7, v2, v3, v4, v5 + shift_store_4 \type, \d_strd, v6, v7 + b.le 0f + load_4h \sr2, \src, \s_strd, v6, v7 + smull_smlal_4 v1, v3, v4, v5, v6 + smull_smlal_4 v2, v4, v5, v6, v7 + shift_store_4 \type, \d_strd, v1, v2 +0: + ret + +480: // 4x8, 4x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + +48: + subs \h, \h, #4 + load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_4 \type, \d_strd, v1, v2, v3, v4 + b.le 0f + mov v16.8b, v20.8b + mov v17.8b, v21.8b + mov v18.8b, v22.8b + mov v19.8b, v23.8b + mov v20.8b, v24.8b + mov v21.8b, v25.8b + mov v22.8b, v26.8b + b 48b +0: + ret + +80: + b.gt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b 
+ + load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + smull_smlal_4 v16, v1, v2, v3, v4 + smull2_smlal2_4 v17, v1, v2, v3, v4 + smull_smlal_4 v18, v2, v3, v4, v5 + smull2_smlal2_4 v19, v2, v3, v4, v5 + shift_store_8 \type, \d_strd, v16, v17, v18, v19 + b.le 0f + load_8h \sr2, \src, \s_strd, v6, v7 + smull_smlal_4 v16, v3, v4, v5, v6 + smull2_smlal2_4 v17, v3, v4, v5, v6 + smull_smlal_4 v18, v4, v5, v6, v7 + smull2_smlal2_4 v19, v4, v5, v6, v7 + shift_store_8 \type, \d_strd, v16, v17, v18, v19 +0: + ret + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + ld1 {v0.8b}, [\xmy] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + +88: + subs \h, \h, #2 + load_8h \sr2, \src, \s_strd, v23, v24 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 + smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.le 9f + subs \h, \h, #2 + load_8h \sr2, \src, \s_strd, v25, v26 + smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 + smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 + smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.le 9f + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add 
\src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + ret + +160: + b.gt 1680b + + // 16x2, 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + + load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 +16: + load_16h \src, \src, \s_strd, v22, v23 + subs \h, \h, #1 + smull_smlal_4 v1, v16, v18, v20, v22 + smull2_smlal2_4 v2, v16, v18, v20, v22 + smull_smlal_4 v3, v17, v19, v21, v23 + smull2_smlal2_4 v4, v17, v19, v21, v23 + shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 + b.le 0f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v23.16b + b 16b +0: + ret + +L(\type\()_8tap_v_tbl): + .hword L(\type\()_8tap_v_tbl) - 1280b + .hword L(\type\()_8tap_v_tbl) - 640b + .hword L(\type\()_8tap_v_tbl) - 320b + .hword L(\type\()_8tap_v_tbl) - 160b + .hword L(\type\()_8tap_v_tbl) - 80b + .hword L(\type\()_8tap_v_tbl) - 40b + .hword L(\type\()_8tap_v_tbl) - 20b + .hword 0 + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx w10, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w10 +4: + add \xmy, x11, \my, uxtw #3 + + adr x10, L(\type\()_8tap_hv_tbl) + dup v30.4s, w12 // 6 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v30.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + dup v29.4s, w13 // 6 + intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v29.4s, v29.4s // -(6+intermediate_bits) +.endif + br x10 + +20: +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 280f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v27.8h}, [\src], \s_strd + ext v28.16b, v27.16b, v27.16b, #2 + smull v27.4s, v27.4h, v0.4h + smull v28.4s, v28.4h, v0.4h + addp v27.4s, 
v27.4s, v28.4s + addp v16.4s, v27.4s, v27.4s + srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + bl L(\type\()_8tap_filter_2) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s + + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b + +2: + bl L(\type\()_8tap_filter_2) + + ext v18.8b, v17.8b, v24.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + umin v2.4h, v2.4h, v31.4h + subs \h, \h, #2 + st1 {v2.s}[0], [\dst], \d_strd + st1 {v2.s}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v24.8b + b 2b + +280: // 2x8, 2x16, 2x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v27.8h}, [\src], \s_strd + ext v28.16b, v27.16b, v27.16b, #2 + smull v27.4s, v27.4h, v0.4h + smull v28.4s, v28.4h, v0.4h + addp v27.4s, v27.4s, v28.4s + addp v16.4s, v27.4s, v27.4s + srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). 
+ + bl L(\type\()_8tap_filter_2) + xtn v16.4h, v16.4s + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b + bl L(\type\()_8tap_filter_2) + ext v18.8b, v17.8b, v24.8b, #4 + mov v19.8b, v24.8b + bl L(\type\()_8tap_filter_2) + ext v20.8b, v19.8b, v24.8b, #4 + mov v21.8b, v24.8b + +28: + bl L(\type\()_8tap_filter_2) + ext v22.8b, v21.8b, v24.8b, #4 + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] + + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + sqxtun v3.4h, v3.4s + umin v3.4h, v3.4h, v31.4h + subs \h, \h, #2 + st1 {v3.s}[0], [\dst], \d_strd + st1 {v3.s}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b + b 28b + +0: + br x15 + +L(\type\()_8tap_filter_2): + ld1 {v25.8h}, [\sr2], \s_strd + ld1 {v27.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v28.16b, v27.16b, v27.16b, #2 + trn1 v24.2s, v25.2s, v27.2s + trn2 v27.2s, v25.2s, v27.2s + trn1 v25.2s, v26.2s, v28.2s + trn2 v28.2s, v26.2s, v28.2s + smull v24.4s, v24.4h, v0.h[0] + smlal v24.4s, v25.4h, v0.h[1] + smlal v24.4s, v27.4h, v0.h[2] + smlal v24.4s, v28.4h, v0.h[3] + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s + ret +.endif + +40: + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 480f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + // 4x2, 4x4 hv + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + 
smlal v25.4s, v28.4h, v0.h[3] + srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v24.8b + mov v18.8b, v25.8b + +4: + bl L(\type\()_8tap_filter_4) + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v24.4h, v1.h[2] + smlal v3.4s, v25.4h, v1.h[3] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + umin v2.8h, v2.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + + st1 {v2.d}[0], [\dst], \d_strd + st1 {v2.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v24.8b + mov v18.8b, v25.8b + b 4b + +480: // 4x8, 4x16, 4x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + smlal v25.4s, v28.4h, v0.h[3] + srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on 
out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v24.8b + mov v18.8b, v25.8b + bl L(\type\()_8tap_filter_4) + mov v19.8b, v24.8b + mov v20.8b, v25.8b + bl L(\type\()_8tap_filter_4) + mov v21.8b, v24.8b + mov v22.8b, v25.8b + +48: + bl L(\type\()_8tap_filter_4) + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] + smull v4.4s, v17.4h, v1.h[0] + smlal v4.4s, v18.4h, v1.h[1] + smlal v4.4s, v19.4h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[7] +.ifc \type, put + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + sqxtun v3.4h, v3.4s + sqxtun2 v3.8h, v4.4s + umin v3.8h, v3.8h, v31.8h +.else + rshrn v3.4h, v3.4s, #6 + rshrn2 v3.8h, v4.4s, #6 + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v3.d}[0], [\dst], \d_strd + st1 {v3.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b + mov v22.8b, v25.8b + b 48b +0: + br x15 + +L(\type\()_8tap_filter_4): + ld1 {v24.8h}, [\sr2], \s_strd + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v24.16b, v24.16b, #2 + ext v27.16b, v24.16b, v24.16b, #4 + ext v28.16b, v24.16b, v24.16b, #6 + smull v24.4s, v24.4h, v0.h[0] + smlal v24.4s, v26.4h, v0.h[1] + smlal v24.4s, v27.4h, v0.h[2] + smlal v24.4s, v28.4h, v0.h[3] + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + smlal v25.4s, v28.4h, 
v0.h[3] + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s + xtn v25.4h, v25.4s + ret + +80: +160: +320: + b.gt 880f + add \xmy, \xmy, #2 + ld1 {v0.8b}, [\xmx] + ld1 {v1.s}[0], [\xmy] + sub \src, \src, #6 + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v27.8h, v28.8h}, [\src], \s_strd + smull v24.4s, v27.4h, v0.h[0] + smull2 v25.4s, v27.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53), + // and conserves register space (no need to clobber v8-v15). 
+ xtn v16.4h, v24.4s + xtn2 v16.8h, v25.4s + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v23.16b + mov v18.16b, v24.16b + +8: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v23.4h, v1.h[2] + smlal2 v5.4s, v23.8h, v1.h[2] + smlal v2.4s, v23.4h, v1.h[3] + smlal2 v3.4s, v23.8h, v1.h[3] + smlal v4.4s, v24.4h, v1.h[3] + smlal2 v5.4s, v24.8h, v1.h[3] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v2.8h}, [\dst], \d_strd + st1 {v3.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + b 8b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + ld1 {v0.8b}, [\xmx] + ld1 {v1.8b}, [\xmy] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v27.8h, v28.8h}, [\src], \s_strd + smull v24.4s, v27.4h, v0.h[0] + smull2 v25.4s, v27.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53), + // and conserves register space (no need to clobber v8-v15). 
+ xtn v16.4h, v24.4s + xtn2 v16.8h, v25.4s + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v23.16b + mov v18.16b, v24.16b + bl L(\type\()_8tap_filter_8) + mov v19.16b, v23.16b + mov v20.16b, v24.16b + bl L(\type\()_8tap_filter_8) + mov v21.16b, v23.16b + mov v22.16b, v24.16b + +88: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v19.4h, v1.h[2] + smlal2 v5.4s, v19.8h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal v4.4s, v20.4h, v1.h[3] + smlal2 v5.4s, v20.8h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal2 v3.4s, v20.8h, v1.h[4] + smlal v4.4s, v21.4h, v1.h[4] + smlal2 v5.4s, v21.8h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal2 v3.4s, v21.8h, v1.h[5] + smlal v4.4s, v22.4h, v1.h[5] + smlal2 v5.4s, v22.8h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal2 v3.4s, v22.8h, v1.h[6] + smlal v4.4s, v23.4h, v1.h[6] + smlal2 v5.4s, v23.8h, v1.h[6] + smlal v2.4s, v23.4h, v1.h[7] + smlal2 v3.4s, v23.8h, v1.h[7] + smlal v4.4s, v24.4h, v1.h[7] + smlal2 v5.4s, v24.8h, v1.h[7] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v2.8h}, [\dst], \d_strd + st1 {v3.8h}, [\ds2], \d_strd + b.le 9f + mov 
v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v23.16b + mov v22.16b, v24.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + br x15 + +L(\type\()_8tap_filter_8): + ld1 {v4.8h, v5.8h}, [\sr2], \s_strd + ld1 {v6.8h, v7.8h}, [\src], \s_strd + smull v25.4s, v4.4h, v0.h[0] + smull2 v26.4s, v4.8h, v0.h[0] + smull v27.4s, v6.4h, v0.h[0] + smull2 v28.4s, v6.8h, v0.h[0] +.irpc i, 1234567 + ext v23.16b, v4.16b, v5.16b, #(2*\i) + ext v24.16b, v6.16b, v7.16b, #(2*\i) + smlal v25.4s, v23.4h, v0.h[\i] + smlal2 v26.4s, v23.8h, v0.h[\i] + smlal v27.4s, v24.4h, v0.h[\i] + smlal2 v28.4s, v24.8h, v0.h[\i] +.endr + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) + srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) + srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) + xtn v23.4h, v25.4s + xtn2 v23.8h, v26.4s + xtn v24.4h, v27.4s + xtn2 v24.8h, v28.4s + ret + +L(\type\()_8tap_hv_tbl): + .hword L(\type\()_8tap_hv_tbl) - 1280b + .hword L(\type\()_8tap_hv_tbl) - 640b + .hword L(\type\()_8tap_hv_tbl) - 320b + .hword L(\type\()_8tap_hv_tbl) - 160b + .hword L(\type\()_8tap_hv_tbl) - 80b + .hword L(\type\()_8tap_hv_tbl) - 40b + .hword L(\type\()_8tap_hv_tbl) - 20b + .hword 0 +endfunc + + +function \type\()_bilin_16bpc_neon, export=1 +.ifc \bdmax, w8 + ldr w8, [sp] +.endif + dup v1.8h, \mx + dup v3.8h, \my + mov w10, #16 + sub w9, w10, \mx + sub w10, w10, \my + dup v0.8h, w9 + dup v2.8h, w10 +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz \bdmax, \bdmax // bitdepth_max + clz w9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + mov w11, #4 + sub w9, w9, #24 + sub w11, w11, 
\bdmax // 4 - intermediate_bits + add w12, \bdmax, #4 // 4 + intermediate_bits + cbnz \mx, L(\type\()_bilin_h) + cbnz \my, L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cbnz \my, L(\type\()_bilin_hv) + + adr x10, L(\type\()_bilin_h_tbl) + dup v31.8h, w11 // 4 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v31.8h, v31.8h // -(4-intermediate_bits) +.ifc \type, put + dup v30.8h, \bdmax // intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v30.8h, v30.8h // -intermediate_bits +.endif + br x10 + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + ld1 {v4.4h}, [\src], \s_strd + ld1 {v6.4h}, [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #2 + ext v7.8b, v6.8b, v6.8b, #2 + trn1 v4.2s, v4.2s, v6.2s + trn1 v5.2s, v5.2s, v7.2s + subs \h, \h, #2 + mul v4.4h, v4.4h, v0.4h + mla v4.4h, v5.4h, v1.4h + urshl v4.4h, v4.4h, v31.4h + urshl v4.4h, v4.4h, v30.4h + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + trn1 v4.2d, v4.2d, v6.2d + trn1 v5.2d, v5.2d, v7.2d + subs \h, \h, #2 + mul v4.8h, v4.8h, v0.8h + mla v4.8h, v5.8h, v1.8h + urshl v4.8h, v4.8h, v31.8h +.ifc \type, put + urshl v4.8h, v4.8h, v30.8h +.else + sub v4.8h, v4.8h, v29.8h +.endif + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.gt 4b + ret + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + ldr h5, [\src, #16] + ldr h7, [\sr2, #16] + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v5.16b, #2 + ext v7.16b, v6.16b, v7.16b, 
#2 + subs \h, \h, #2 + mul v4.8h, v4.8h, v0.8h + mla v4.8h, v5.8h, v1.8h + mul v6.8h, v6.8h, v0.8h + mla v6.8h, v7.8h, v1.8h + urshl v4.8h, v4.8h, v31.8h + urshl v6.8h, v6.8h, v31.8h +.ifc \type, put + urshl v4.8h, v4.8h, v30.8h + urshl v6.8h, v6.8h, v30.8h +.else + sub v4.8h, v4.8h, v29.8h + sub v6.8h, v6.8h, v29.8h +.endif + st1 {v4.8h}, [\dst], \d_strd + st1 {v6.8h}, [\ds2], \d_strd + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, uxtw #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw #1 +.endif +161: + ld1 {v16.8h}, [\src], #16 + ld1 {v21.8h}, [\sr2], #16 + mov \mx, \w + +16: + ld1 {v17.8h, v18.8h}, [\src], #32 + ld1 {v22.8h, v23.8h}, [\sr2], #32 + ext v19.16b, v16.16b, v17.16b, #2 + ext v20.16b, v17.16b, v18.16b, #2 + ext v24.16b, v21.16b, v22.16b, #2 + ext v25.16b, v22.16b, v23.16b, #2 + mul v16.8h, v16.8h, v0.8h + mla v16.8h, v19.8h, v1.8h + mul v17.8h, v17.8h, v0.8h + mla v17.8h, v20.8h, v1.8h + mul v21.8h, v21.8h, v0.8h + mla v21.8h, v24.8h, v1.8h + mul v22.8h, v22.8h, v0.8h + mla v22.8h, v25.8h, v1.8h + urshl v16.8h, v16.8h, v31.8h + urshl v17.8h, v17.8h, v31.8h + urshl v21.8h, v21.8h, v31.8h + urshl v22.8h, v22.8h, v31.8h + subs \mx, \mx, #16 +.ifc \type, put + urshl v16.8h, v16.8h, v30.8h + urshl v17.8h, v17.8h, v30.8h + urshl v21.8h, v21.8h, v30.8h + urshl v22.8h, v22.8h, v30.8h +.else + sub v16.8h, v16.8h, v29.8h + sub v17.8h, v17.8h, v29.8h + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v29.8h +.endif + st1 {v16.8h, v17.8h}, [\dst], #32 + st1 {v21.8h, v22.8h}, [\ds2], #32 + b.le 9f + + mov v16.16b, v18.16b + mov v21.16b, v23.16b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_bilin_h_tbl): + .hword L(\type\()_bilin_h_tbl) - 1280b + .hword 
L(\type\()_bilin_h_tbl) - 640b + .hword L(\type\()_bilin_h_tbl) - 320b + .hword L(\type\()_bilin_h_tbl) - 160b + .hword L(\type\()_bilin_h_tbl) - 80b + .hword L(\type\()_bilin_h_tbl) - 40b + .hword L(\type\()_bilin_h_tbl) - 20b + .hword 0 + + +L(\type\()_bilin_v): + cmp \h, #4 + adr x10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + dup v31.8h, w11 // 4 - intermediate_bits +.endif + ldrh w9, [x10, x9, lsl #1] +.ifc \type, prep + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 + neg v31.8h, v31.8h // -(4-intermediate_bits) +.endif + sub x10, x10, w9, uxtw + br x10 + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + ld1 {v16.s}[0], [\src], \s_strd + b.gt 24f + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + mul v4.4h, v16.4h, v2.4h + mla v4.4h, v17.4h, v3.4h + urshr v4.8h, v4.8h, #4 + st1 {v4.s}[0], [\dst] + st1 {v4.s}[1], [\ds2] + ret +24: // 2x4, 2x8, ... 
v + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + ld1 {v19.s}[0], [\sr2], \s_strd + ld1 {v20.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + trn1 v18.2s, v18.2s, v19.2s + trn1 v19.2s, v19.2s, v20.2s + trn1 v16.2d, v16.2d, v18.2d + trn1 v17.2d, v17.2d, v19.2d + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #4 + urshr v4.8h, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + st1 {v4.s}[2], [\dst], \d_strd + st1 {v4.s}[3], [\ds2], \d_strd + b.le 0f + mov v16.8b, v20.8b + b 24b +0: + ret +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.4h}, [\src], \s_strd +4: + ld1 {v17.4h}, [\sr2], \s_strd + ld1 {v18.4h}, [\src], \s_strd + trn1 v16.2d, v16.2d, v17.2d + trn1 v17.2d, v17.2d, v18.2d + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + sub v4.8h, v4.8h, v29.8h +.endif + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + b 4b +0: + ret + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.8h}, [\src], \s_strd +8: + ld1 {v17.8h}, [\sr2], \s_strd + ld1 {v18.8h}, [\src], \s_strd + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v18.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + urshl v5.8h, v5.8h, v31.8h + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h +.endif + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd + b.le 0f + mov v16.16b, v18.16b + b 8b +0: + ret + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v16.8h, v17.8h}, [\src], \s_strd +2: + ld1 {v18.8h, v19.8h}, [\sr2], \s_strd + ld1 {v20.8h, v21.8h}, [\src], \s_strd + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v18.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v19.8h, v3.8h + mul v6.8h, v18.8h, v2.8h + mla v6.8h, v20.8h, v3.8h + mul v7.8h, v19.8h, v2.8h + mla v7.8h, v21.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 + urshr v6.8h, v6.8h, #4 + urshr v7.8h, v7.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + urshl v5.8h, v5.8h, v31.8h + urshl v6.8h, v6.8h, v31.8h + urshl v7.8h, v7.8h, v31.8h + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h + sub v6.8h, v6.8h, v29.8h + sub v7.8h, v7.8h, v29.8h +.endif + st1 {v4.8h, v5.8h}, [\dst], \d_strd + st1 {v6.8h, v7.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b 2b +9: + subs \w, \w, #16 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + ret + +L(\type\()_bilin_v_tbl): + .hword L(\type\()_bilin_v_tbl) - 1280b + .hword L(\type\()_bilin_v_tbl) - 640b + .hword L(\type\()_bilin_v_tbl) - 320b + .hword L(\type\()_bilin_v_tbl) - 160b + .hword L(\type\()_bilin_v_tbl) - 80b + .hword L(\type\()_bilin_v_tbl) - 40b + .hword L(\type\()_bilin_v_tbl) - 20b + .hword 0 + +L(\type\()_bilin_hv): + adr x10, L(\type\()_bilin_hv_tbl) + dup v31.8h, w11 // 4 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v31.8h, v31.8h // -(4-intermediate_bits) +.ifc \type, put + dup v30.4s, w12 // 4 + intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v30.4s, v30.4s // -(4+intermediate_bits) +.endif + br x10 + +20: // 2xN hv 
+.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v20.4h}, [\src], \s_strd + ext v21.8b, v20.8b, v20.8b, #2 + mul v16.4h, v20.4h, v0.4h + mla v16.4h, v21.4h, v1.4h + urshl v16.4h, v16.4h, v31.4h + +2: + ld1 {v22.4h}, [\sr2], \s_strd + ld1 {v24.4h}, [\src], \s_strd + ext v23.8b, v22.8b, v22.8b, #2 + ext v25.8b, v24.8b, v24.8b, #2 + trn1 v22.2s, v22.2s, v24.2s + trn1 v23.2s, v23.2s, v25.2s + mul v17.4h, v22.4h, v0.4h + mla v17.4h, v23.4h, v1.4h + urshl v17.4h, v17.4h, v31.4h + + trn1 v16.2s, v16.2s, v17.2s + + umull v4.4s, v16.4h, v2.4h + umlal v4.4s, v17.4h, v3.4h + urshl v4.4s, v4.4s, v30.4s + xtn v4.4h, v4.4s + subs \h, \h, #2 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2s, v17.2s, v17.2s + b 2b +0: + ret +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v20.8h}, [\src], \s_strd + ext v21.16b, v20.16b, v20.16b, #2 + mul v16.4h, v20.4h, v0.4h + mla v16.4h, v21.4h, v1.4h + urshl v16.4h, v16.4h, v31.4h + +4: + ld1 {v22.8h}, [\sr2], \s_strd + ld1 {v24.8h}, [\src], \s_strd + ext v23.16b, v22.16b, v22.16b, #2 + ext v25.16b, v24.16b, v24.16b, #2 + trn1 v22.2d, v22.2d, v24.2d + trn1 v23.2d, v23.2d, v25.2d + mul v17.8h, v22.8h, v0.8h + mla v17.8h, v23.8h, v1.8h + urshl v17.8h, v17.8h, v31.8h + + trn1 v16.2d, v16.2d, v17.2d + + umull v4.4s, v16.4h, v2.4h + umlal v4.4s, v17.4h, v3.4h + umull2 v5.4s, v16.8h, v2.8h + umlal2 v5.4s, v17.8h, v3.8h +.ifc \type, put + urshl v4.4s, v4.4s, v30.4s + urshl v5.4s, v5.4s, v30.4s + xtn v4.4h, v4.4s + xtn2 v4.8h, v5.4s +.else + rshrn v4.4h, v4.4s, #4 + rshrn2 v4.8h, v5.4s, #4 + sub v4.8h, v4.8h, v29.8h +.endif + subs \h, \h, #2 + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2d, v17.2d, v17.2d + b 4b +0: + ret + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ldr h21, [\src, #16] + ld1 {v20.8h}, [\src], \s_strd + ext v21.16b, v20.16b, v21.16b, #2 + mul v16.8h, v20.8h, v0.8h + mla v16.8h, v21.8h, v1.8h + urshl v16.8h, v16.8h, v31.8h + +2: + ldr h23, [\sr2, #16] + ld1 {v22.8h}, [\sr2], \s_strd + ldr h25, [\src, #16] + ld1 {v24.8h}, [\src], \s_strd + ext v23.16b, v22.16b, v23.16b, #2 + ext v25.16b, v24.16b, v25.16b, #2 + mul v17.8h, v22.8h, v0.8h + mla v17.8h, v23.8h, v1.8h + mul v18.8h, v24.8h, v0.8h + mla v18.8h, v25.8h, v1.8h + urshl v17.8h, v17.8h, v31.8h + urshl v18.8h, v18.8h, v31.8h + + umull v4.4s, v16.4h, v2.4h + umlal v4.4s, v17.4h, v3.4h + umull2 v5.4s, v16.8h, v2.8h + umlal2 v5.4s, v17.8h, v3.8h + umull v6.4s, v17.4h, v2.4h + umlal v6.4s, v18.4h, v3.4h + umull2 v7.4s, v17.8h, v2.8h + umlal2 v7.4s, v18.8h, v3.8h +.ifc \type, put + urshl v4.4s, v4.4s, v30.4s + urshl v5.4s, v5.4s, v30.4s + urshl v6.4s, v6.4s, v30.4s + urshl v7.4s, v7.4s, v30.4s + xtn v4.4h, v4.4s + xtn2 v4.8h, v5.4s + xtn v5.4h, v6.4s + xtn2 v5.8h, v7.4s +.else + rshrn v4.4h, v4.4s, #4 + rshrn2 v4.8h, v5.4s, #4 + rshrn v5.4h, v6.4s, #4 + rshrn2 v5.8h, v7.4s, #4 + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h +.endif + subs \h, \h, #2 + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + ret + +L(\type\()_bilin_hv_tbl): + .hword L(\type\()_bilin_hv_tbl) - 1280b + .hword L(\type\()_bilin_hv_tbl) - 640b + .hword L(\type\()_bilin_hv_tbl) - 320b + .hword L(\type\()_bilin_hv_tbl) - 160b + .hword L(\type\()_bilin_hv_tbl) - 80b + .hword L(\type\()_bilin_hv_tbl) - 40b + .hword 
L(\type\()_bilin_hv_tbl) - 20b
+        .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+// load_filter_row: \dst = the 8 bytes at x11 + (\src >> 10) * 8, then \src += \inc (clobbers w13).
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10
+        ldr             \dst, [x11, w13, sxtw #3]
+        add             \src, \src, \inc
+.endm
+// Horizontal pass for one source row: 8 sums in v16.4s/v17.4s; x2 += x3, w5 += w8; uses x11, w7, v14.
+function warp_filter_horz_neon
+        add             w12, w5, #512 // w12 = w5 + 512, i.e. half of the 1 << 10 step used by load_filter_row
+
+        ld1             {v16.8h, v17.8h}, [x2], x3 // 16 input pixels, then x2 += x3
+
+        load_filter_row d0, w12, w7
+        load_filter_row d1, w12, w7
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h, v0.8b // widen the 8 signed byte taps for output 0 to 16 bit
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h, v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h, v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h, v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h, v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h, v5.8b
+        ext             v18.16b, v16.16b, v17.16b, #2*1 // input window starting 1 pixel later
+        smull           v8.4s, v16.4h, v0.4h // output 0: pixels 0-3 times taps 0-3
+        smull2          v9.4s, v16.8h, v0.8h // output 0: pixels 4-7 times taps 4-7
+        sxtl            v6.8h, v6.8b
+        ext             v19.16b, v16.16b, v17.16b, #2*2
+        smull           v10.4s, v18.4h, v1.4h
+        smull2          v11.4s, v18.8h, v1.8h
+        sxtl            v7.8h, v7.8b
+        ext             v20.16b, v16.16b, v17.16b, #2*3
+        smull           v0.4s, v19.4h, v2.4h // v0/v1 are reused as accumulators once filter 0 and 1 are consumed
+        smull2          v1.4s, v19.8h, v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4
+        addp            v8.4s, v8.4s, v9.4s // first of three pairwise-add levels reducing 8 products per output
+        smull           v2.4s, v20.4h, v3.4h
+        smull2          v3.4s, v20.8h, v3.8h
+        ext             v22.16b, v16.16b, v17.16b, #2*5
+        addp            v9.4s, v10.4s, v11.4s
+        smull           v10.4s, v21.4h, v4.4h
+        smull2          v11.4s, v21.8h, v4.8h
+        ext             v23.16b, v16.16b, v17.16b, #2*6
+        addp            v0.4s, v0.4s, v1.4s
+        smull           v18.4s, v22.4h, v5.4h
+        smull2          v19.4s, v22.8h, v5.8h
+        ext             v16.16b, v16.16b, v17.16b, #2*7
+        addp            v1.4s, v2.4s, v3.4s
+        addp            v2.4s, v10.4s, v11.4s
+        smull           v20.4s, v23.4h, v6.4h
+        smull2          v21.4s, v23.8h, v6.8h
+        addp            v3.4s, v18.4s, v19.4s
+        smull           v22.4s, v16.4h, v7.4h
+        smull2          v23.4s, v16.8h, v7.8h
+        addp            v4.4s, v20.4s, v21.4s
+        addp            v5.4s, v22.4s, v23.4s
+
+        addp            v8.4s, v8.4s, v9.4s // second level: partial sums for outputs 0 and 1
+        addp            v0.4s, v0.4s, v1.4s
+        addp            v2.4s, v2.4s, v3.4s
+        addp            v4.4s, v4.4s, v5.4s
+
+        addp            v16.4s, v8.4s, v0.4s // third level: v16 = full sums for outputs 0-3
+        addp            v17.4s, v2.4s, v4.4s // v17 = full sums for outputs 4-7
+
+        add             w5, w5, w8 // step the start position for the next row
+
+        srshl           v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
+        srshl           v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)
+
+        ret
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my,
+//         const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+        stp             d8, d9, [sp, #-0x40]! // save d8-d15; v8-v11 and v13-v15 are written below
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+
+.ifb \t
+        dup             v15.8h, w7 // bitdepth_max
+.else
+        movi            v15.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+        clz             w7, w7
+        // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+        sub             w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+        sub             w7, w7, #25 // -(7 - intermediate_bits)
+.ifb \t
+        neg             w8, w8 // -(7 + intermediate_bits)
+.endif
+        dup             v14.4s, w7 // -(7 - intermediate_bits)
+.ifb \t
+        dup             v13.4s, w8 // -(7 + intermediate_bits)
+.endif
+
+        ldr             x4, [x4] // all four int16 abcd values in one 64-bit load
+        sbfx            x7, x4, #0, #16 // abcd[0]: per-pixel step used by the horizontal pass
+        sbfx            x8, x4, #16, #16 // abcd[1]: per-row step added to mx (w5)
+        sbfx            x9, x4, #32, #16 // abcd[2]: per-column step used by the vertical pass
+        sbfx            x4, x4, #48, #16 // abcd[3]: per-row step added to my (w6)
+        mov             w10, #8 // output row counter
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3 // src -= 3 * src_stride
+        sub             x2, x2, #6 // src -= 3 pixels (2 bytes each)
+        movrel          x11, X(mc_warp_filter), 64*8 // presumably table base + 64 rows of 8 bytes (movrel is defined elsewhere)
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+.ifnb \t
+        lsl             x1, x1, #1 // t variant: double x1 before it is used as the store stride
+.endif
+
+        bl              warp_filter_horz_neon
+        xtn             v24.4h, v16.4s // v24-v30: first 7 filtered rows of the 8-row vertical window
+        xtn2            v24.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v25.4h, v16.4s
+        xtn2            v25.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v26.4h, v16.4s
+        xtn2            v26.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v27.4h, v16.4s
+        xtn2            v27.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v28.4h, v16.4s
+        xtn2            v28.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v29.4h, v16.4s
+        xtn2            v29.8h, v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v30.4h, v16.4s
+        xtn2            v30.8h, v17.4s
+
+1:
+        add             w14, w6, #512 // w14 = my + 512, same bias as in the horizontal pass
+        bl              warp_filter_horz_neon
+        xtn             v31.4h, v16.4s // newest (8th) row of the window
+        xtn2            v31.8h, v17.4s
+
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
+        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        sxtl            v0.8h, v0.8b // after the transpose vN is multiplied against row N of the window
+        sxtl            v1.8h, v1.8b
+        sxtl            v2.8h, v2.8b
+        sxtl            v3.8h, v3.8b
+        sxtl            v4.8h, v4.8b
+        sxtl            v5.8h, v5.8b
+        sxtl            v6.8h, v6.8b
+        sxtl            v7.8h, v7.8b
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s, v24.4h, v0.4h
+        smlal           v16.4s, v25.4h, v1.4h
+        smlal           v16.4s, v26.4h, v2.4h
+        smlal           v16.4s, v27.4h, v3.4h
+        smlal           v16.4s, v28.4h, v4.4h
+        smlal           v16.4s, v29.4h, v5.4h
+        smlal           v16.4s, v30.4h, v6.4h
+        smlal           v16.4s, v31.4h, v7.4h
+        smull2          v17.4s, v24.8h, v0.8h
+        smlal2          v17.4s, v25.8h, v1.8h
+        smlal2          v17.4s, v26.8h, v2.8h
+        smlal2          v17.4s, v27.8h, v3.8h
+        smlal2          v17.4s, v28.8h, v4.8h
+        smlal2          v17.4s, v29.8h, v5.8h
+        smlal2          v17.4s, v30.8h, v6.8h
+        smlal2          v17.4s, v31.8h, v7.8h
+
+        mov             v24.16b, v25.16b // slide the window up one row (interleaved with the output math)
+        mov             v25.16b, v26.16b
+.ifb \t
+        srshl           v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
+        srshl           v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
+.else
+        rshrn           v16.4h, v16.4s, #7
+        rshrn2          v16.8h, v17.4s, #7
+.endif
+        mov             v26.16b, v27.16b
+.ifb \t
+        sqxtun          v16.4h, v16.4s
+        sqxtun2         v16.8h, v17.4s
+.else
+        sub             v16.8h, v16.8h, v15.8h // PREP_BIAS
+.endif
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        umin            v16.8h, v16.8h, v15.8h // bitdepth_max
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1 // flags from here feed the b.gt below (st1 and add leave them alone)
+        st1             {v16.8h}, [x0], x1 // store one row of 8 outputs, dst += x1
+
+        add             w6, w6, w4 // my += abcd[3]
+        b.gt            1b
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8, d9, [sp], #0x40
+
+        ret             x15 // return to the saved x30; RET (not BR) pairs with the caller bl and needs no BTI landing pad
+endfunc
+.endm
+
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+        ldp             x8, x9, [sp] // x8 = ref, x9 = ref_stride (the two stack arguments)
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3, #1 // ih - 1
+        cmp             x5, x3
+        sub             x13, x2, #1 // iw - 1
+        csel            x12, x12, x5, ge // min(y, ih - 1)
+        cmp             x4, x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4, ge // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8, x12, x9, x8 // ref += iclip() * stride
+        add             x8, x8, x13, lsl #1 // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5, x1 // y + bh
+        neg             x5, x5 // -y
+        sub             x10, x10, x3 // y + bh - ih
+        sub             x12, x1, #1 // bh - 1
+        cmp             x10, x1
+        bic             x5, x5, x5, asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt // min(y + bh - ih, bh-1)
+        cmp             x5, x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4, x0 // x + bw
+        neg             x4, x4 // -x
+        sub             x11, x11, x2 // x + bw - iw
+        sub             x13, x0, #1 // bw - 1
+        cmp             x11, x0
+        bic             x4, x4, x4, asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt // min(x + bw - iw, bw-1)
+        cmp             x4, x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1, x1, x5 // bh - top_ext
+        madd            x6, x5, x7, x6 // dst += top_ext * dst_stride
+        sub             x2, x0, x4 // bw - left_ext
+        sub             x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+        sub             x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+        ld1r            {v0.8h}, [x8] // replicate the first pixel of the ref row
+        mov             x12, x6 // out = dst
+        mov             x3, x4
+        mov             v1.16b, v0.16b
+1:
+        subs            x3, x3, #16 // 16 pixels are stored per iteration, so left_ext is rounded up
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+        mov             x13, x8
+        add             x12, x6, x4, lsl #1 // out = dst + left_ext
+        mov             x3, x2
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+        subs            x3, x3, #32 // the centre is copied 32 pixels per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+        b.gt            1b
+.if \need_right
+        add             x3, x8, x2, lsl #1 // in + center_w
+        sub             x3, x3, #2 // in + center_w - 1
+        add             x12, x6, x4, lsl #1 // dst + left_ext
+        ld1r            {v0.8h}, [x3] // replicate the last centre pixel
+        add             x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
+        mov             x3, x11
+        mov             v1.16b, v0.16b
+1:
+        subs            x3, x3, #16
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+
+        subs            x1, x1, #1 // center_h--
+        add             x6, x6, x7 // next dst row
+        add             x8, x8, x9 // next ref row
+        b.gt            0b
+.endm
+
+        cbz             x4, 2f
+        // need_left
+        cbz             x11, 3f
+        // need_left + need_right
+        v_loop          1, 1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f
+        // !need_left + need_right
+        v_loop          0, 1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1, 0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0, 0
+
+5:
+
+        cbz             x10, 3f
+        // need_bottom
+        sub             x8, x6, x7 // ref = dst - stride
+        mov             x4, x0
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+        mov             x3, x10
+2:
+        subs            x3, x3, #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6, x7, x10, x6 // dst -= bottom_ext * stride
+        subs            x4, x4, #32 // bw -= 32
+        add             x6, x6, #64 // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5, 3f
+        // need_top
+        msub            x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
+        mov             x3, x5
+2:
+        subs            x3, x3, #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6, x7, x5, x6 // dst -= top_ext * stride
+        subs            x0, x0, #32 // bw -= 32
+        add             x6, x6, #64 // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
new file mode 100644
index 0000000000..3a6cf900a9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 8 +#define DIF 16 +#define RNG 24 +#define CNT 28 +#define ALLOW_UPDATE_CDF 32 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro ld1_n d0, d1, src, sz, n +.if \n <= 8 + ld1 {\d0\sz}, [\src] +.else + ld1 {\d0\sz, \d1\sz}, [\src] +.endif +.endm + +.macro st1_n s0, s1, dst, sz, n +.if \n <= 8 + st1 {\s0\sz}, [\dst] +.else + st1 {\s0\sz, \s1\sz}, [\dst] +.endif +.endm + +.macro ushr_n d0, d1, s0, s1, shift, sz, n + ushr \d0\sz, \s0\sz, \shift +.if \n == 16 + ushr \d1\sz, \s1\sz, \shift +.endif +.endm + +.macro add_n d0, d1, s0, s1, s2, s3, sz, n + add \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + add \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sub_n d0, d1, s0, s1, s2, s3, sz, n + sub \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sub \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro and_n d0, d1, s0, s1, s2, s3, sz, n + and \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + and \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n + cmhs \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + cmhs \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n + urhadd \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + urhadd \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n + sshl \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sshl \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n + sqdmulh \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sqdmulh \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro str_n idx0, idx1, dstreg, dstoff, n + str \idx0, [\dstreg, \dstoff] +.if \n == 16 + str \idx1, [\dstreg, \dstoff + 16] +.endif +.endm + +// unsigned 
// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);
//
// Arguments as used below: x0 = s, x1 = cdf, x2 = n_symbols.
// The decoded symbol index is built in w15 and returned in w0 (label 9).
//
// Stack layout set up by decode_update (48 bytes reserved):
//   sp+14      halfword: original rng ("u" for symbol 0)
//   sp+16...   the per-symbol v values (16 or 32 bytes)
// L(renorm) indexes this area with ret, reading v at sp+16+2*ret and u from
// the halfword directly below it, so for ret == 0 it picks up the rng value
// stored at sp+14.

function msac_decode_symbol_adapt4_neon, export=1
// decode_update: compute the v thresholds for all symbols at once, find the
// symbol index, and, when s->allow_update_cdf is set, adapt the CDF.
//   sz  = halfword arrangement (.4h or .8h), szb = matching byte arrangement,
//   n   = number of lanes handled (4, 8 or 16; 16 uses register pairs).
// Falls through to whatever follows the expansion; branches to L(renorm)
// directly when CDF updating is disabled.
.macro decode_update sz, szb, n
        sub             sp,  sp,  #48
        add             x8,  x0,  #RNG
        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
        ld1r            {v4\sz},  [x8]                            // rng
        movrel          x9,  coeffs, 30
        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
        sub             x9,  x9,  x2, lsl #1                      // coeffs + 30 - 2 * n_symbols
        mvni            v30\sz, #0x3f                             // 0xffc0
        and             v7\szb, v4\szb, v31\szb                   // rng & 0x7f00
        str             h4,  [sp, #14]                            // store original u = s->rng
        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0

        ld1_n           v4,  v5,  x9,  \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             x8,  x0,  #DIF + 6                        // address of the top 16 bits of dif

        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
        movrel          x8,  bits
        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access

        ld1_n           v16, v17, x8,  .8h, \n

        cmhs_n          v2,  v3,  v6,  v6,  v4,  v5,  .8h,  \n    // c >= v

        and_n           v6,  v7,  v2,  v3,  v16, v17, .16b, \n    // One bit per halfword set in the mask
.if \n == 16
        add             v6.8h,  v6.8h,  v7.8h
.endif
        addv            h6,  v6.8h                                // Aggregate mask bits
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        umov            w3,  v6.h[0]
        // rbit + clz counts the trailing zero bits of the mask, i.e. the
        // index of the first lane for which c >= v holds.
        rbit            w3,  w3
        clz             w15, w3                                   // ret

        cbz             w4,  L(renorm)
        // update_cdf
        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
        movi            v5\szb, #0xff
.if \n == 16
        mov             w4,  #-5
.else
        mvn             w14, w2
        mov             w4,  #-4
        cmn             w14, #3                                   // set C if n_symbols <= 2
.endif
        urhadd_n        v4,  v5,  v5,  v5,  v2,  v3,  \sz, \n     // i >= val ? -1 : 32768
.if \n == 16
        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
.else
        lsr             w14, w3,  #4                              // count >> 4
        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6\sz,    w4                              // -rate

        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
        sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
        sshl_n          v4,  v5,  v4,  v5,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
        add             w3,  w3,  #1                              // count + (count < 32)
        add_n           v0,  v1,  v0,  v1,  v4,  v5,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        st1_n           v0,  v1,  x1,  \sz, \n
        strh            w3,  [x1, x2, lsl #1]
.endm

        decode_update   .4h, .8b, 4

// Renormalisation shared by every decoder in this file.
// L(renorm):  entered with ret in w15 and the v table on the stack.
// L(renorm2): entered with w4 = new rng, w5 = d, w6 = cnt, x7 = ~dif,
//             w15 = return value, and 48 bytes reserved on the stack
//             (released again before ret).
L(renorm):
        add             x8,  sp,  #16
        add             x8,  x8,  w15, uxtw #1
        ldrh            w3,  [x8]              // v
        ldurh           w4,  [x8, #-2]         // u
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        sub             w4,  w4,  w3           // rng = u - v
        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        mvn             x7,  x7                // ~dif
        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
L(renorm2):
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
        str             w4,  [x0, #RNG]
        mvn             x7,  x7                // ~dif
        b.hs            9f                     // no borrow: enough bits left, skip refill

        // refill
        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        add             x5,  x3,  #8
        cmp             x5,  x4
        b.gt            2f                     // fewer than 8 bytes left: bytewise path

        ldr             x3,  [x3]              // next_bits
        add             w8,  w6,  #23          // shift_bits = cnt + 23
        add             w6,  w6,  #16          // cnt += 16
        rev             x3,  x3                // next_bits = bswap(next_bits)
        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
        and             w8,  w8,  #24          // shift_bits &= 24
        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
        str             x5,  [x0, #BUF_POS]
        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
        mov             w4,  #48
        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
        eor             x7,  x7,  x3           // dif ^= next_bits
        b               9f

2:      // refill_eob
        mov             w14, #40
        sub             w5,  w14, w6           // c = 40 - cnt
3:
        cmp             x3,  x4
        b.ge            4f
        ldrb            w8,  [x3], #1
        lsl             x8,  x8,  x5
        eor             x7,  x7,  x8
        subs            w5,  w5,  #8
        b.ge            3b

4:      // refill_eob_end
        str             x3,  [x0, #BUF_POS]
        sub             w6,  w14, w5           // cnt = 40 - c

9:
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]

        mov             w0,  w15
        add             sp,  sp,  #48
        ret
endfunc

// 8- and 16-symbol variants: same macro with wider vectors, then jump into
// the renormalisation tail of the adapt4 function above.
function msac_decode_symbol_adapt8_neon, export=1
        decode_update   .8h, .16b, 8
        b               L(renorm)
endfunc

function msac_decode_symbol_adapt16_neon, export=1
        decode_update   .8h, .16b, 16
        b               L(renorm)
endfunc
1 : 0) + sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate + add w9, w9, #1 // count + (count < 32) + add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + st1 {v0.4h}, [x1] + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + strh w9, [x1, #6] + +2: + add x8, x8, w15, uxtw #1 + ldrh w3, [x8] // v + ldurh w4, [x8, #-2] // u + sub w4, w4, w3 // rng = u - v + clz w5, w4 // clz(rng) + eor w5, w5, #16 // d = clz(rng) ^ 16 + mvn x7, x7 // ~dif + add x7, x7, x3, lsl #48 // ~dif + (v << 48) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (~dif + (v << 48)) << d + str w4, [x0, #RNG] + dup v3.4h, w4 + mvn x7, x7 // ~dif + b.hs 9f + + // refill + ldp x3, x4, [x0] // BUF_POS, BUF_END + add x5, x3, #8 + cmp x5, x4 + b.gt 2f + + ldr x3, [x3] // next_bits + add w8, w6, #23 // shift_bits = cnt + 23 + add w6, w6, #16 // cnt += 16 + rev x3, x3 // next_bits = bswap(next_bits) + sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 + and w8, w8, #24 // shift_bits &= 24 + lsr x3, x3, x8 // next_bits >>= shift_bits + sub w8, w8, w6 // shift_bits -= 16 + cnt + str x5, [x0, #BUF_POS] + lsl x3, x3, x8 // next_bits <<= shift_bits + mov w4, #48 + sub w6, w4, w8 // cnt = cnt + 64 - shift_bits + eor x7, x7, x3 // dif ^= next_bits + b 9f + +2: // refill_eob + mov w14, #40 + sub w5, w14, w6 // c = 40 - cnt +3: + cmp x3, x4 + b.ge 4f + ldrb w8, [x3], #1 + lsl x8, x8, x5 + eor x7, x7, x8 + subs w5, w5, #8 + b.ge 3b + +4: // refill_eob_end + str x3, [x0, #BUF_POS] + sub w6, w14, w5 // cnt = 40 - c + +9: + lsl w15, w15, #1 + sub w15, w15, #5 + lsr x12, x7, #48 + adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 + dup v1.8h, w12 + b.cc 1b // loop if !carry + add w13, w13, #30 + str w6, [x0, #CNT] + add sp, sp, #48 + str x7, [x0, #DIF] + lsr w0, w13, #1 + ret +endfunc + +function msac_decode_bool_equi_neon, export=1 + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + bic w4, w5, #0xff // r &= 0xff00 + add w4, w4, #8 + subs x8, x7, x4, 
// Decode one bool with the probability passed in w1 (x0 = MsacContext).
// Computes v from rng and f, compares dif against v << 48, then hands over
// to L(renorm2) with w4 = new rng, w5 = d, w6 = cnt, x7 = ~dif and w15 = the
// value to return. The 48 bytes reserved here are released by the shared
// tail at L(renorm2).
// Note that "ret" in the inline comments is the condition dif >= vw (hs),
// whereas the value returned to the caller (w15) is 1 when dif < vw (lo).
function msac_decode_bool_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        sub             sp,  sp,  #48
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        bic             w1,  w1,  #0x3f        // f &= ~63
        mul             w4,  w4,  w1
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        clz             w5,  w4                // clz(rng)
        mvn             x7,  x7                // ~dif
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        b               L(renorm2)
endfunc

// Same as msac_decode_bool_neon, but the probability comes from cdf[0]
// (x1 = cdf; cdf[0] and the adaptation counter cdf[1] are fetched with one
// 32-bit load) and, when s->allow_update_cdf is non-zero, cdf[0] and the
// counter are updated before renormalising.
function msac_decode_bool_adapt_neon, export=1
        ldr             w9,  [x1]              // cdf[0-1]
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        sub             sp,  sp,  #48
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        and             w2,  w9,  #0xffc0      // f &= ~63
        mul             w4,  w4,  w2
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        ldr             w10, [x0, #ALLOW_UPDATE_CDF]

        clz             w5,  w4                // clz(rng)
        mvn             x7,  x7                // ~dif
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16

        cbz             w10, L(renorm2)

        lsr             w2,  w9,  #16          // count = cdf[1]
        and             w9,  w9,  #0xffff      // cdf[0]

        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
        lsr             w2,  w2,  #4           // count >> 4
        add             w10, w3,  #1           // count + (count < 32)
        add             w2,  w2,  #4           // rate = (count >> 4) | 4

        sub             w9,  w9,  w15          // cdf[0] -= bit
        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
        sub             w9,  w9,  w11          // cdf[0]

        strh            w9,  [x1]
        strh            w10, [x1, #2]

        b               L(renorm2)
endfunc
0000000000..fc0e0d04f1 --- /dev/null +++ b/third_party/dav1d/src/arm/64/util.S @@ -0,0 +1,197 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// movrel rd, val, offset=0
// Load the address of symbol val plus a constant offset into register rd,
// using the addressing sequence appropriate for the target:
//   - Apple:        adrp/add with @PAGE / @PAGEOFF relocations
//   - PIC on Win32: adrp/add with :lo12:
//   - other PIC:    adrp/add with :lo12:
//   - non-PIC:      "ldr rd, =expr" pseudo-instruction
// On the Apple and Win32 PIC paths a negative offset is not folded into the
// relocation expression; the symbol address is formed first and the offset
// is then applied with a separate sub.
.macro  movrel rd, val, offset=0
#if defined(__APPLE__)
  .if \offset < 0
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if \offset < 0
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .endif
#elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        ldr             \rd, =\val+\offset
#endif
.endm
\r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \t9\().4s, \r3\().4s + trn2 \t9\().4s, \t9\().4s, \r3\().4s + trn1 \r3\().4s, \t8\().4s, \r1\().4s + trn2 \t8\().4s, \t8\().4s, \r1\().4s + + trn1 \r0\().2d, \r3\().2d, \r4\().2d + trn2 \r4\().2d, \r3\().2d, \r4\().2d + trn1 \r1\().2d, \r5\().2d, \r6\().2d + trn2 \r5\().2d, \r5\().2d, \r6\().2d + trn2 \r6\().2d, \t8\().2d, \r2\().2d + trn1 \r2\().2d, \t8\().2d, \r2\().2d + trn1 \r3\().2d, \t9\().2d, \r7\().2d + trn2 \r7\().2d, \t9\().2d, \r7\().2d +.endm + +.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 + trn1 \t8\().16b, \r0\().16b, \r1\().16b + trn2 \t9\().16b, \r0\().16b, \r1\().16b + trn1 \r1\().16b, \r2\().16b, \r3\().16b + trn2 \r3\().16b, \r2\().16b, \r3\().16b + trn1 \r0\().16b, \r4\().16b, \r5\().16b + trn2 \r5\().16b, \r4\().16b, \r5\().16b + trn1 \r2\().16b, \r6\().16b, \r7\().16b + trn2 \r7\().16b, \r6\().16b, \r7\().16b + + trn1 \r4\().8h, \r0\().8h, \r2\().8h + trn2 \r2\().8h, \r0\().8h, \r2\().8h + trn1 \r6\().8h, \r5\().8h, \r7\().8h + trn2 \r7\().8h, \r5\().8h, \r7\().8h + trn1 \r5\().8h, \t9\().8h, \r3\().8h + trn2 \t9\().8h, \t9\().8h, \r3\().8h + trn1 \r3\().8h, \t8\().8h, \r1\().8h + trn2 \t8\().8h, \t8\().8h, \r1\().8h + + trn1 \r0\().4s, \r3\().4s, \r4\().4s + trn2 \r4\().4s, \r3\().4s, \r4\().4s + trn1 \r1\().4s, \r5\().4s, \r6\().4s + trn2 \r5\().4s, \r5\().4s, \r6\().4s + trn2 \r6\().4s, \t8\().4s, \r2\().4s + trn1 \r2\().4s, \t8\().4s, \r2\().4s + trn1 \r3\().4s, \t9\().4s, \r7\().4s + trn2 \r7\().4s, \t9\().4s, \r7\().4s +.endm + +.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().16b, \r0\().16b, \r1\().16b + trn2 \t5\().16b, \r0\().16b, \r1\().16b + trn1 
\t6\().16b, \r2\().16b, \r3\().16b + trn2 \t7\().16b, \r2\().16b, \r3\().16b + + trn1 \r0\().8h, \t4\().8h, \t6\().8h + trn2 \r2\().8h, \t4\().8h, \t6\().8h + trn1 \r1\().8h, \t5\().8h, \t7\().8h + trn2 \r3\().8h, \t5\().8h, \t7\().8h +.endm + +.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4h, \r0\().4h, \r1\().4h + trn2 \t5\().4h, \r0\().4h, \r1\().4h + trn1 \t6\().4h, \r2\().4h, \r3\().4h + trn2 \t7\().4h, \r2\().4h, \r3\().4h + + trn1 \r0\().2s, \t4\().2s, \t6\().2s + trn2 \r2\().2s, \t4\().2s, \t6\().2s + trn1 \r1\().2s, \t5\().2s, \t7\().2s + trn2 \r3\().2s, \t5\().2s, \t7\().2s +.endm + +.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4s, \r0\().4s, \r1\().4s + trn2 \t5\().4s, \r0\().4s, \r1\().4s + trn1 \t6\().4s, \r2\().4s, \r3\().4s + trn2 \t7\().4s, \r2\().4s, \r3\().4s + + trn1 \r0\().2d, \t4\().2d, \t6\().2d + trn2 \r2\().2d, \t4\().2d, \t6\().2d + trn1 \r1\().2d, \t5\().2d, \t7\().2d + trn2 \r3\().2d, \t5\().2d, \t7\().2d +.endm + +.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \r0\().4s, \t4\().4s, \t6\().4s + trn2 \r2\().4s, \t4\().4s, \t6\().4s + trn1 \r1\().4s, \t5\().4s, \t7\().4s + trn2 \r3\().4s, \t5\().4s, \t7\().4s +.endm + +#endif /* DAV1D_SRC_ARM_64_UTIL_S */ diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S new file mode 100644 index 0000000000..0e59c0228b --- /dev/null +++ b/third_party/dav1d/src/arm/asm.S @@ -0,0 +1,155 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// function name, export=0, align=2
// Opens an assembly function in .text with the requested alignment and
// defines a one-shot "endfunc" macro that closes it again (emitting .size
// on ELF and .endfunc when HAVE_AS_FUNC is set, then purging itself so the
// next "function" can define a fresh one).
// With export=1 the symbol EXTERN<name> is additionally made global, with
// hidden (ELF) or private_extern (Mach-O) visibility, and both EXTERN<name>
// and the plain <name> label are placed at the entry point. Without export
// only the plain local label is emitted.
.macro function name, export=0, align=2
    .macro endfunc
#ifdef __ELF__
        .size   \name, . - \name
#endif
#if HAVE_AS_FUNC
        .endfunc
#endif
        .purgem endfunc
    .endm
        .text
        .align \align
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .type   EXTERN\name, %function
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
#if HAVE_AS_FUNC
        .func   EXTERN\name
#endif
EXTERN\name:
    .else
#ifdef __ELF__
        .type \name, %function
#endif
#if HAVE_AS_FUNC
        .func \name
#endif
    .endif
\name:
.endm
- \name +#endif + .purgem endconst + .endm +#if defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name +#endif +EXTERN\name: + .endif +\name: +.endm + +#ifdef __APPLE__ +#define L(x) L ## x +#else +#define L(x) .L ## x +#endif + +#define X(x) CONCAT(EXTERN, x) + +#if ARCH_AARCH64 +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 +#endif + +#endif /* DAV1D_SRC_ARM_ASM_S */ diff --git a/third_party/dav1d/src/arm/cdef_init_tmpl.c b/third_party/dav1d/src/arm/cdef_init_tmpl.c new file mode 100644 index 0000000000..1f59619ba3 --- /dev/null +++ b/third_party/dav1d/src/arm/cdef_init_tmpl.c @@ -0,0 +1,85 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" + +decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon)); + +void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, int h, + enum CdefEdgeFlags edges); +void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, int h, + enum CdefEdgeFlags edges); + +// Passing edges to this function, to allow it to switch to a more +// optimized version for fully edged cases. Using size_t for edges, +// to avoid ABI differences for passing more than one argument on the stack. 
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); +void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); + +#define DEFINE_FILTER(w, h, tmp_stride) \ +static void \ +cdef_filter_##w##x##h##_neon(pixel *dst, \ + const ptrdiff_t stride, \ + const pixel (*left)[2], const pixel *const top, \ + const int pri_strength, const int sec_strength, \ + const int dir, const int damping, \ + const enum CdefEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \ + BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges); \ + BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \ + sec_strength, dir, damping, h, edges \ + HIGHBD_TAIL_SUFFIX); \ +} + +DEFINE_FILTER(8, 8, 16) +DEFINE_FILTER(4, 8, 8) +DEFINE_FILTER(4, 4, 8) + + +COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->dir = BF(dav1d_cdef_find_dir, neon); + c->fb[0] = cdef_filter_8x8_neon; + c->fb[1] = cdef_filter_4x8_neon; + c->fb[2] = cdef_filter_4x4_neon; +} diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c new file mode 100644 index 0000000000..b7a0d3adbc --- /dev/null +++ b/third_party/dav1d/src/arm/cpu.c @@ -0,0 +1,99 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common/attributes.h" + +#include "src/arm/cpu.h" + +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 +// NEON is always available; runtime tests are not needed. 
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM +#include + +#ifndef HWCAP_ARM_NEON +#define HWCAP_ARM_NEON (1 << 12) +#endif +#define NEON_HWCAP HWCAP_ARM_NEON + +#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM +#include + +#define NEON_HWCAP HWCAP_NEON + +#elif defined(__ANDROID__) +#include +#include + +static unsigned parse_proc_cpuinfo(const char *flag) { + FILE *file = fopen("/proc/cpuinfo", "r"); + if (!file) + return 0; + + char line_buffer[120]; + const char *line; + + while ((line = fgets(line_buffer, sizeof(line_buffer), file))) { + if (strstr(line, flag)) { + fclose(file); + return 1; + } + // if line is incomplete seek back to avoid splitting the search + // string into two buffers + if (!strchr(line, '\n') && strlen(line) > strlen(flag)) { + // use fseek since the 64 bit fseeko is only available since + // Android API level 24 and meson defines _FILE_OFFSET_BITS + // by default 64 + if (fseek(file, -strlen(flag), SEEK_CUR)) + break; + } + } + + fclose(file); + + return 0; +} +#endif + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = 0; +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 + flags |= DAV1D_ARM_CPU_FLAG_NEON; +#elif defined(HAVE_GETAUXVAL) && ARCH_ARM + unsigned long hw_cap = getauxval(AT_HWCAP); + flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM + unsigned long hw_cap = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); + flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#elif defined(__ANDROID__) + flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#endif + + return flags; +} diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h new file mode 100644 index 0000000000..8c10a1b6b0 --- /dev/null +++ b/third_party/dav1d/src/arm/cpu.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
+
+/* ARM CPU feature bits; values are single bits so they can be OR-ed into a mask. */
+enum CpuFlags {
+    DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+};
+
+/* Returns the mask of CpuFlags bits that apply to the CPU the code runs on. */
+unsigned dav1d_get_cpu_flags_arm(void);
+
+#endif /* DAV1D_SRC_ARM_CPU_H */
diff --git a/third_party/dav1d/src/arm/ipred_init_tmpl.c b/third_party/dav1d/src/arm/ipred_init_tmpl.c
new file mode 100644
index 0000000000..3662be087b
--- /dev/null
+++ b/third_party/dav1d/src/arm/ipred_init_tmpl.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+/* NEON implementations; only declared here, not defined in this file. */
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+/*
+ * Install the NEON intra prediction functions into `c`.
+ *
+ * `c` is left untouched when the CPU flags do not include NEON, and also
+ * when this template is built for a bit depth other than 8 on a target
+ * that is not AArch64 (the assignments are compiled out in that case).
+ */
+COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->intra_pred[DC_PRED]       = BF(dav1d_ipred_dc, neon);
+    c->intra_pred[DC_128_PRED]   = BF(dav1d_ipred_dc_128, neon);
+    c->intra_pred[TOP_DC_PRED]   = BF(dav1d_ipred_dc_top, neon);
+    c->intra_pred[LEFT_DC_PRED]  = BF(dav1d_ipred_dc_left, neon);
+    c->intra_pred[HOR_PRED]      = BF(dav1d_ipred_h, neon);
+    c->intra_pred[VERT_PRED]     = BF(dav1d_ipred_v, neon);
+    c->intra_pred[PAETH_PRED]    = BF(dav1d_ipred_paeth, neon);
+    c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
+    c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+    c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+    c->intra_pred[FILTER_PRED]   = BF(dav1d_ipred_filter, neon);
+
+    c->cfl_pred[DC_PRED]      = BF(dav1d_ipred_cfl, neon);
+    c->cfl_pred[DC_128_PRED]  = BF(dav1d_ipred_cfl_128, neon);
+    c->cfl_pred[TOP_DC_PRED]  = BF(dav1d_ipred_cfl_top, neon);
+    c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+    /* cfl_ac is indexed by pixel layout minus one */
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+    c->pal_pred = BF(dav1d_pal_pred, neon);
+#endif
+}
diff --git a/third_party/dav1d/src/arm/itx_init_tmpl.c b/third_party/dav1d/src/arm/itx_init_tmpl.c
new file mode 100644
index 0000000000..ad418f2db5
--- /dev/null
+++ b/third_party/dav1d/src/arm/itx_init_tmpl.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+/*
+ * Declaration helpers. Each decl_itxN_fns(w, h, opt) declares N inverse
+ * transform functions for a w x h block by building on the next smaller set:
+ * 2 (dct_dct, identity_identity), 12, 16, and 17 (adds wht_wht).
+ */
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+/* One line per block size; the helper used fixes which transform types exist. */
+decl_itx17_fns( 4,  4, neon);
+decl_itx16_fns( 4,  8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8,  4, neon);
+decl_itx16_fns( 8,  8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16,  4, neon);
+decl_itx16_fns(16,  8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32,  8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+/* Sizes with a 64 dimension only get the dct_dct variant. */
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+/*
+ * Install the NEON inverse transform functions into `c`.
+ *
+ * Nothing is installed when the CPU flags lack NEON, when `bpc` is above 10,
+ * or when this template is built for a bit depth other than 8 on a target
+ * that is not AArch64.
+ *
+ * The assign_itxN_fn macros mirror the decl_itxN_fns helpers above. `pfx` is
+ * pasted in front of TX_ to form the table index (empty or R in the calls
+ * below). The enum name is passed separately from the function-name part
+ * because the two do not paste from the same token (for example the
+ * dct_adst function is stored under ADST_DCT).
+ */
+COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
+    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
+    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
+    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
+    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
+
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+    if (bpc > 10) return;
+
+#if ARCH_AARCH64 || BITDEPTH == 8
+    assign_itx17_fn( ,  4,  4, neon);
+    assign_itx16_fn(R,  4,  8, neon);
+    assign_itx16_fn(R,  4, 16, neon);
+    assign_itx16_fn(R,  8,  4, neon);
+    assign_itx16_fn( ,  8,  8, neon);
+    assign_itx16_fn(R,  8, 16, neon);
+    assign_itx2_fn (R,  8, 32, neon);
+    assign_itx16_fn(R, 16,  4, neon);
+    assign_itx16_fn(R, 16,  8, neon);
+    assign_itx12_fn( , 16, 16, neon);
+    assign_itx2_fn (R, 16, 32, neon);
+    assign_itx1_fn (R, 16, 64, neon);
+    assign_itx2_fn (R, 32,  8, neon);
+    assign_itx2_fn (R, 32, 16, neon);
+    assign_itx2_fn ( , 32, 32, neon);
+    assign_itx1_fn (R, 32, 64, neon);
+    assign_itx1_fn (R, 64, 16, neon);
+    assign_itx1_fn (R, 64, 32, neon);
+    assign_itx1_fn ( , 64, 64, neon);
+#endif
+}
diff --git a/third_party/dav1d/src/arm/loopfilter_init_tmpl.c b/third_party/dav1d/src/arm/loopfilter_init_tmpl.c
new file mode 100644
index 0000000000..d44f8e1d96
--- /dev/null
+++ b/third_party/dav1d/src/arm/loopfilter_init_tmpl.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+/* NEON implementations; only declared here, not defined in this file. */
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+/*
+ * Install the NEON loop filter functions into `c`.
+ *
+ * `c` is left untouched when the CPU flags do not include NEON, and also
+ * when this template is built for a bit depth other than 8 on a target
+ * that is not AArch64.
+ */
+COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    /* first index: 0 = y, 1 = uv; second index: 0 = h, 1 = v */
+    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+#endif
+}
diff --git a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
new file mode 100644
index 0000000000..125f28c62e
--- /dev/null
+++ b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" +#include "src/tables.h" + +// The 8bpc version calculates things slightly differently than the reference +// C version. That version calculates roughly this: +// int16_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += src[idx] * fh[i]; +// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; +// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; +// sum += 1 << (bitdepth + 6 - round_bits_h); +// Compared to the reference C version, this is the output of the first pass +// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. +// with round_offset precompensated. +// The 16bpc version calculates things pretty much the same way as the +// reference C version, but with the end result subtracted by +// 1 << (bitdepth + 6 - round_bits_h). +void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], + const pixel *src, ptrdiff_t stride, + const int16_t fh[7], const intptr_t w, + int h, enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +// This calculates things slightly differently than the reference C version. +// This version calculates roughly this: +// fv[3] += 128; +// int32_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += mid[idx] * fv[i]; +// sum = (sum + rounding_off_v) >> round_bits_v; +// This function assumes that the width is a multiple of 8. 
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+                                     const int16_t *mid, int w, int h,
+                                     const int16_t fv[7], enum LrEdgeFlags edges,
+                                     ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+// Used below to copy the w & 7 leftover columns from a temporary buffer
+// into the destination.
+void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
+                                 const pixel *src, int w, int h);
+
+// Wiener filter assembled from the NEON horizontal and vertical passes.
+//
+// The horizontal pass writes int16_t rows into `mid`, whose row pitch is w
+// rounded up to a multiple of 8. Rows 0-1 are produced from lpf (only with
+// LR_HAVE_TOP), rows 2 .. h + 1 from dst, and the two rows after that from
+// lpf + 6 * PXSTRIDE(lpf_stride) (only with LR_HAVE_BOTTOM). The vertical
+// pass reads mid starting at row 2 and writes the result to dst.
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                               const pixel (*const left)[4],
+                               const pixel *lpf, const ptrdiff_t lpf_stride,
+                               const int w, const int h, const int16_t fh[7],
+                               const int16_t fv[7], const enum LrEdgeFlags edges
+                               HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int16_t, mid, 68 * 384,);
+    int mid_stride = (w + 7) & ~7;
+
+    // Horizontal filter
+    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
+                                    fh, w, h, edges HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
+                                        fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+                                        lpf + 6 * PXSTRIDE(lpf_stride),
+                                        lpf_stride, fh, w, 2, edges
+                                        HIGHBD_TAIL_SUFFIX);
+
+    // Vertical filter
+    // The part of the width that is a multiple of 8 goes straight to dst.
+    if (w >= 8)
+        BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
+                                        w & ~7, h, fv, edges,
+                                        mid_stride * sizeof(*mid)
+                                        HIGHBD_TAIL_SUFFIX);
+    if (w & 7) {
+        // For uneven widths, do a full 8 pixel wide filtering into a temp
+        // buffer and copy out the narrow slice of pixels separately into dest.
+        ALIGN_STK_16(pixel, tmp, 64 * 8,);
+        BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
+                                        &mid[2*mid_stride + (w & ~7)],
+                                        w & 7, h, fv, edges,
+                                        mid_stride * sizeof(*mid)
+                                        HIGHBD_TAIL_SUFFIX);
+        BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
+    }
+}
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+/* Runs box3_h over the block (plus two lpf rows above and below when the
+ * matching edge flags are set), then box3_v, calc_ab1 and finish_filter1,
+ * leaving the int16_t result in tmp. */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    /* sumsq/sum start two rows of (384 + 16) elements (plus 8/16 elements)
+     * into their backing arrays, so the negative row indices used for the
+     * top edge below stay inside the arrays. a/b alias the same storage. */
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+    BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, lpf_stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
+
+    dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+/* Same sequence and buffer layout as dav1d_sgr_filter1_neon, using the
+ * box5 / ab2 / finish_filter2 functions instead. */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+    BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, lpf_stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
+
+    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int w, const int h,
+                                   const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int16_t *t2,
+                                   const int w, const int h,
+                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+// Self-guided restoration filter, applied in place on dst.
+//
+// dav1d_sgr_params[sgr_idx] selects one of three paths:
+//   - entry [0] is zero: only the 3x3 filter runs, with strength from
+//     entry [3] and weight (1 << 7) - sgr_wt[1];
+//   - otherwise entry [1] is zero: only the 5x5 filter runs, with strength
+//     from entry [2] and weight sgr_wt[0];
+//   - otherwise both filters run and their outputs are combined with the
+//     weights { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }.
+// In each path the columns up to w & ~7 are written directly to dst, and
+// the remaining w & 7 columns go through a temporary stripe.
+static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                            const pixel (*const left)[4],
+                            const pixel *lpf, const ptrdiff_t lpf_stride,
+                            const int w, const int h, const int sgr_idx,
+                            const int16_t sgr_wt[7], const enum LrEdgeFlags edges
+                            HIGHBD_DECL_SUFFIX)
+{
+    if (!dav1d_sgr_params[sgr_idx][0]) {
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+        dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges
+                               HIGHBD_TAIL_SUFFIX);
+        if (w >= 8)
+            BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8 pixel wide filtering into a temp
+            // buffer and copy out the narrow slice of pixels separately into
+            // dest.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp + (w & ~7), w & 7, h,
+                                          (1 << 7) - sgr_wt[1]
+                                          HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    } else if (!dav1d_sgr_params[sgr_idx][1]) {
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+        dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges
+                               HIGHBD_TAIL_SUFFIX);
+        if (w >= 8)
+            BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp, w & ~7, h, sgr_wt[0]
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8 pixel wide filtering into a temp
+            // buffer and copy out the narrow slice of pixels separately into
+            // dest.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp + (w & ~7), w & 7, h, sgr_wt[0]
+                                          HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    } else {
+        ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+        dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges
+                               HIGHBD_TAIL_SUFFIX);
+        dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges
+                               HIGHBD_TAIL_SUFFIX);
+        const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
+        if (w >= 8)
+            BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp1, tmp2, w & ~7, h, wt
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8 pixel wide filtering into a temp
+            // buffer and copy out the narrow slice of pixels separately into
+            // dest.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp1 + (w & ~7), tmp2 + (w & ~7),
+                                          w & 7, h, wt HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    }
+}
+#endif // BITDEPTH == 8 || ARCH_AARCH64
+
+// Install the NEON loop restoration functions into c.
+// Nothing is installed without the NEON CPU flag. The Wiener filter is set
+// whenever NEON is present; the self-guided filter only when it was compiled
+// in (8 bpc build or AArch64) and bpc is at most 10.
+COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+    c->wiener = wiener_filter_neon;
+#if BITDEPTH == 8 || ARCH_AARCH64
+    if (bpc <= 10)
+        c->selfguided = sgr_filter_neon;
+#endif
+}
diff --git a/third_party/dav1d/src/arm/mc_init_tmpl.c b/third_party/dav1d/src/arm/mc_init_tmpl.c
new file mode 100644
index 0000000000..b4db40ac83
--- /dev/null
+++ b/third_party/dav1d/src/arm/mc_init_tmpl.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+/* NEON implementations; only declared here, not defined in this file. */
+decl_mc_fn(BF(dav1d_put_8tap_regular,        neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp,  neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth,         neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp,   neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp,          neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular,  neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth,   neon));
+decl_mc_fn(BF(dav1d_put_bilin,               neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular,        neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp,  neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth,         neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp,   neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp,          neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular,  neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth,   neon));
+decl_mct_fn(BF(dav1d_prep_bilin,               neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+/*
+ * Install the NEON motion compensation functions into `c`.
+ *
+ * `c` is left untouched when the CPU flags do not include NEON. The put/prep
+ * filters, avg, w_avg, mask and both warp functions are installed for every
+ * build of this template; blend, blend_h, blend_v, w_mask and emu_edge only
+ * for 8 bpc builds or on AArch64.
+ *
+ * Marked COLD like the other ARM DSP init functions; it only fills in
+ * function pointers.
+ */
+COLD void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+    c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+    c->mct[type] = BF(dav1d_prep_##name, suffix)
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               neon);
+
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
+
+    c->avg = BF(dav1d_avg, neon);
+    c->w_avg = BF(dav1d_w_avg, neon);
+    c->mask = BF(dav1d_mask, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->blend = BF(dav1d_blend, neon);
+    c->blend_h = BF(dav1d_blend_h, neon);
+    c->blend_v = BF(dav1d_blend_v, neon);
+    c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+    c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+    c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+#endif
+    c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+    c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->emu_edge = BF(dav1d_emu_edge, neon);
+#endif
+}
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
new file mode 100644
index 0000000000..9db0bf86ae
--- /dev/null
+++ b/third_party/dav1d/src/arm/msac.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_ARM_MSAC_H +#define DAV1D_SRC_ARM_MSAC_H +/* Prototypes of the _neon msac decode routines; each takes the MsacContext being read and returns the decoded value as unsigned. */ +unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); +unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); +/* On AArch64, or whenever the compiler defines __ARM_NEON, alias the generic dav1d_msac_decode_* names directly to the _neon routines. */ +#if ARCH_AARCH64 || defined(__ARM_NEON) +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon +#endif + +#endif /* DAV1D_SRC_ARM_MSAC_H */ diff --git a/third_party/dav1d/src/cdef.h b/third_party/dav1d/src/cdef.h new file mode 100644 index 0000000000..5dd52cf6c4 --- /dev/null +++ b/third_party/dav1d/src/cdef.h @@ -0,0 +1,73 @@ +/* + * Copyright © 2018, VideoLAN and 
dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_CDEF_H +#define DAV1D_SRC_CDEF_H + +#include <stddef.h> +#include <stdint.h> + +#include "common/bitdepth.h" +/* Bit flags naming the sides of a block on which neighbouring pixels are available to the CDEF filters. */ +enum CdefEdgeFlags { + CDEF_HAVE_LEFT = 1 << 0, + CDEF_HAVE_RIGHT = 1 << 1, + CDEF_HAVE_TOP = 1 << 2, + CDEF_HAVE_BOTTOM = 1 << 3, +}; +/* Pointer to rows of two pixels each (the saved left-neighbour columns); declared as const void * when BITDEPTH is not defined. */ +#ifdef BITDEPTH +typedef const pixel (*const_left_pixel_row_2px)[2]; +#else +typedef const void *const_left_pixel_row_2px; +#endif + +// CDEF operates entirely on pre-filter data; if bottom/right edges are +// present (according to $edges), then the pre-filter data is located in +// $dst.
However, the edge pixels above $dst may be post-filter, so in +// order to get access to pre-filter top pixels, use $top. +#define decl_cdef_fn(name) \ +void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \ + const pixel *top, int pri_strength, int sec_strength, \ + int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) +typedef decl_cdef_fn(*cdef_fn); +/* Direction search: returns the chosen direction index and stores a variance estimate through var. */ +#define decl_cdef_dir_fn(name) \ +int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX) +typedef decl_cdef_dir_fn(*cdef_dir_fn); +/* Table of CDEF entry points: one direction search plus one block filter per layout slot. */ +typedef struct Dav1dCdefDSPContext { + cdef_dir_fn dir; + cdef_fn fb[3 /* 444/luma, 422, 420 */]; +} Dav1dCdefDSPContext; +/* Initialisers for the table above, declared once per bitdepth via bitfn_decls(); the _arm/_ppc/_x86 variants are the per-architecture hooks. */ +bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c); +bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c); +bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c); +bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c); + +#endif /* DAV1D_SRC_CDEF_H */ diff --git a/third_party/dav1d/src/cdef_apply.h b/third_party/dav1d/src/cdef_apply.h new file mode 100644 index 0000000000..ffdffba055 --- /dev/null +++ b/third_party/dav1d/src/cdef_apply.h @@ -0,0 +1,38 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_CDEF_APPLY_H +#define DAV1D_SRC_CDEF_APPLY_H + +#include "common/bitdepth.h" + +#include "src/internal.h" +/* Applies CDEF to frame f for the row indices by_start up to (not including) by_end; p[] holds one pointer per plane and lflvl supplies the per-superblock cdef indices and noskip masks. */ +void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3], + const Av1Filter *lflvl, int by_start, int by_end); + +#endif /* DAV1D_SRC_CDEF_APPLY_H */ diff --git a/third_party/dav1d/src/cdef_apply_tmpl.c b/third_party/dav1d/src/cdef_apply_tmpl.c new file mode 100644 index 0000000000..c45c7109d8 --- /dev/null +++ b/third_party/dav1d/src/cdef_apply_tmpl.c @@ -0,0 +1,234 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <string.h> + +#include "common/intops.h" + +#include "src/cdef_apply.h" + +/* Plane selectors for backup2x8(): bit 0 requests luma, bit 1 requests chroma. */ +enum Backup2x8Flags { + BACKUP_2X8_Y = 1 << 0, + BACKUP_2X8_UV = 1 << 1, +}; +/* Copies two whole pixel rows per plane from src to dst: luma rows 6 and 7, and chroma rows 2 and 3 for I420 or 6 and 7 otherwise (chroma is skipped for I400). With a negative stride the copy starts at the lower-addressed row so the same two rows are covered. */ +static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3], + const ptrdiff_t stride[2], + const enum Dav1dPixelLayout layout) +{ + const ptrdiff_t y_stride = PXSTRIDE(stride[0]); + if (y_stride < 0) + pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride); + else + pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride); + + if (layout != DAV1D_PIXEL_LAYOUT_I400) { + const ptrdiff_t uv_stride = PXSTRIDE(stride[1]); + if (uv_stride < 0) { + const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7; + pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride); + pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride); + } else { + const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 
2 : 6; + pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride); + pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride); + } + } +} +/* Saves, for each plane selected by flag, the two pixels immediately left of column x_off on every row of the block into dst[plane][row]: 8 luma rows, and 8 chroma rows (4 for I420) with x_off halved when chroma is horizontally subsampled. Chroma is skipped for I400. */ +static void backup2x8(pixel dst[3][8][2], + /*const*/ pixel *const src[3], + const ptrdiff_t src_stride[2], int x_off, + const enum Dav1dPixelLayout layout, + const enum Backup2x8Flags flag) +{ + ptrdiff_t y_off = 0; + if (flag & BACKUP_2X8_Y) { + for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0])) + pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2); + } + + if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV)) + return; + + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + + x_off >>= ss_hor; + y_off = 0; + for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) { + pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2); + pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2); + } +} +/* Scales strength by (4 + i) / 16 with rounding, where i = min(ulog2(var >> 6), 12), or 0 when var >> 6 is zero; returns 0 outright when var is 0. */ +static int adjust_strength(const int strength, const unsigned var) { + if (!var) return 0; + const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0; + return (strength * (4 + i) + 8) >> 4; +} +/* Runs CDEF over row indices by_start up to by_end in steps of two, walking each step in sbsz-wide groups and then in 8-pixel blocks. A group is skipped when its cdef index is -1 or both its strengths are zero; a block is skipped when none of its noskip bits are set. Pre-filter rows and columns are backed up as it goes so later blocks still see unfiltered neighbours. */ +void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + pixel *const p[3], + const Av1Filter *const lflvl, + const int by_start, const int by_end) +{ + const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8; + const Dav1dDSPContext *const dsp = f->dsp; + enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? 
CDEF_HAVE_TOP : 0); + pixel *ptrs[3] = { p[0], p[1], p[2] }; + const int sbsz = 16; + const int sb64w = f->sb128w << 1; + const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8; + const enum Dav1dPixelLayout layout = f->cur.p.layout; + const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout; + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 }, + { 7, 0, 2, 4, 5, 6, 6, 6 } }; + const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422]; + + for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) { + const int tf = f->lf.top_pre_cdef_toggle; + const int by_idx = by & 30; + if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM; + + if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration + backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout); + + ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]); + pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] }; + edges &= ~CDEF_HAVE_LEFT; + edges |= CDEF_HAVE_RIGHT; + enum Backup2x8Flags prev_flag = 0; + for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) { + const int sb128x = sbx >> 1; + const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1); + const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx]; + if (cdef_idx == -1 || + (!f->frame_hdr->cdef.y_strength[cdef_idx] && + !f->frame_hdr->cdef.uv_strength[cdef_idx])) + { + last_skip = 1; + goto next_sb; + } + + const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx]; + const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx]; + const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1); + + const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8; + int y_sec_lvl = y_lvl & 3; + y_sec_lvl += y_sec_lvl == 3; + y_sec_lvl <<= bitdepth_min_8; + + const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8; + int uv_sec_lvl = uv_lvl & 3; + uv_sec_lvl += 
uv_sec_lvl == 3; + uv_sec_lvl <<= bitdepth_min_8; + + pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] }; + for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw); + bx += 2, edges |= CDEF_HAVE_LEFT) + { + if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT; + + // check if this 8x8 block had any coded coefficients; if not, + // go to the next block + const unsigned bx_mask = 3U << (bx & 14); + const int bx_idx = (bx & 16) >> 4; + if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] | + lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask)) + { + last_skip = 1; + goto next_b; + } + const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag; + prev_flag = flag; + if (do_left && edges & CDEF_HAVE_LEFT) { + // we didn't backup the prefilter data because it wasn't + // there, so do it here instead + backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left); + } + if (edges & CDEF_HAVE_RIGHT) { + // backup pre-filter data for next iteration + backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag); + } + + int dir; + unsigned variance; + if (y_pri_lvl || uv_pri_lvl) + dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0], + &variance HIGHBD_CALL_SUFFIX); + + if (y_pri_lvl) { + const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance); + if (adj_y_pri_lvl || y_sec_lvl) + dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0], + &f->lf.cdef_line[tf][0][bx * 4], + adj_y_pri_lvl, y_sec_lvl, dir, + damping, edges HIGHBD_CALL_SUFFIX); + } else if (y_sec_lvl) + dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0], + &f->lf.cdef_line[tf][0][bx * 4], + 0, y_sec_lvl, 0, + damping, edges HIGHBD_CALL_SUFFIX); + if (uv_lvl) { + assert(layout != DAV1D_PIXEL_LAYOUT_I400); + const int uvdir = uv_pri_lvl ? 
uv_dir[dir] : 0; + for (int pl = 1; pl <= 2; pl++) { + dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl], + &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor], + uv_pri_lvl, uv_sec_lvl, uvdir, + damping - 1, edges HIGHBD_CALL_SUFFIX); + } + } + + bit ^= 1; + last_skip = 0; + + next_b: + bptrs[0] += 8; + bptrs[1] += 8 >> ss_hor; + bptrs[2] += 8 >> ss_hor; + } + + next_sb: + iptrs[0] += sbsz * 4; + iptrs[1] += sbsz * 4 >> ss_hor; + iptrs[2] += sbsz * 4 >> ss_hor; + } + + ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]); + ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; + ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; + f->lf.top_pre_cdef_toggle ^= 1; + } +} diff --git a/third_party/dav1d/src/cdef_tmpl.c b/third_party/dav1d/src/cdef_tmpl.c new file mode 100644 index 0000000000..41e3fe6674 --- /dev/null +++ b/third_party/dav1d/src/cdef_tmpl.c @@ -0,0 +1,312 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <stdlib.h> + +#include "common/intops.h" + +#include "src/cdef.h" +#include "src/tables.h" +/* Limits |diff| to max(0, threshold - (|diff| >> shift)) and passes the result to apply_sign() together with diff. */ +static inline int constrain(const int diff, const int threshold, + const int shift) +{ + const int adiff = abs(diff); + return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff); +} +/* Writes INT16_MIN into a w x h region of tmp whose rows are stride elements apart. */ +static inline void fill(int16_t *tmp, const ptrdiff_t stride, + const int w, const int h) +{ + /* Use a value that's a large positive number when interpreted as unsigned, + * and a large negative number when interpreted as signed.
*/ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) + tmp[x] = INT16_MIN; + tmp += stride; + } +} +/* Builds the int16_t working copy of a w x h block plus a 2-pixel border in tmp (tmp addresses the block's top-left pixel). Borders whose edge flag is absent are filled via fill(); the remaining area is copied from top (rows above), left (two columns to the left) and src (the block and whatever lies right of and below it). */ +static void padding(int16_t *tmp, const ptrdiff_t tmp_stride, + const pixel *src, const ptrdiff_t src_stride, + const pixel (*left)[2], const pixel *top, + const int w, const int h, + const enum CdefEdgeFlags edges) +{ + // fill extended input buffer + int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2; + if (!(edges & CDEF_HAVE_TOP)) { + fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2); + y_start = 0; + } + if (!(edges & CDEF_HAVE_BOTTOM)) { + fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2); + y_end -= 2; + } + if (!(edges & CDEF_HAVE_LEFT)) { + fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start); + x_start = 0; + } + if (!(edges & CDEF_HAVE_RIGHT)) { + fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start); + x_end -= 2; + } + + for (int y = y_start; y < 0; y++) { + for (int x = x_start; x < x_end; x++) + tmp[x + y * tmp_stride] = top[x]; + top += PXSTRIDE(src_stride); + } + for (int y = 0; y < h; y++) + for (int x = x_start; x < 0; x++) + tmp[x + y * tmp_stride] = left[y][2 + x]; + for (int y = 0; y < y_end; y++) { + for (int x = (y < h) ? 
0 : x_start; x < x_end; x++) + tmp[x] = src[x]; + src += PXSTRIDE(src_stride); + tmp += tmp_stride; + } +} +/* C implementation of the CDEF filter for one w x h block (w and h are each 4 or 8). The block is first padded into a 12-element-wide int16_t buffer; then, depending on which of pri_strength and sec_strength is non-zero, primary taps along dir and/or secondary taps along dir + 2 and dir - 2 are accumulated and added to each pixel. Only the combined primary+secondary path clips the result to the min/max of the sampled pixels. */ +static NOINLINE void +cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const int pri_strength, const int sec_strength, + const int dir, const int damping, const int w, int h, + const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const ptrdiff_t tmp_stride = 12; + assert((w == 4 || w == 8) && (h == 4 || h == 8)); + int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4) + int16_t *tmp = tmp_buf + 2 * tmp_stride + 2; + + padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges); + + if (pri_strength) { + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1); + const int pri_shift = imax(0, damping - ulog2(pri_strength)); + if (sec_strength) { + const int sec_shift = imax(0, damping - ulog2(sec_strength)); + do { + for (int x = 0; x < w; x++) { + const int px = dst[x]; + int sum = 0; + int max = px, min = px; + int pri_tap_k = pri_tap; + for (int k = 0; k < 2; k++) { + const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir + const int p0 = tmp[x + off1]; + const int p1 = tmp[x - off1]; + sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); + sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); + // if pri_tap_k == 4 then it becomes 2 else it remains 3 + pri_tap_k = (pri_tap_k & 3) | 2; + min = umin(p0, min); + max = imax(p0, max); + min = umin(p1, min); + max = imax(p1, max); + const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2 + const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2 + const int s0 = tmp[x + off2]; + const int s1 = tmp[x - off2]; + const int s2 = tmp[x + off3]; + const int s3 = tmp[x - off3]; + // sec_tap starts at 2 and becomes 1 + const int sec_tap = 2 - k; + sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); + sum += 
sec_tap * constrain(s1 - px, sec_strength, sec_shift); + sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); + sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); + min = umin(s0, min); + max = imax(s0, max); + min = umin(s1, min); + max = imax(s1, max); + min = umin(s2, min); + max = imax(s2, max); + min = umin(s3, min); + max = imax(s3, max); + } + dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max); + } + dst += PXSTRIDE(dst_stride); + tmp += tmp_stride; + } while (--h); + } else { // pri_strength only + do { + for (int x = 0; x < w; x++) { + const int px = dst[x]; + int sum = 0; + int pri_tap_k = pri_tap; + for (int k = 0; k < 2; k++) { + const int off = dav1d_cdef_directions[dir + 2][k]; // dir + const int p0 = tmp[x + off]; + const int p1 = tmp[x - off]; + sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); + sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); + pri_tap_k = (pri_tap_k & 3) | 2; + } + dst[x] = px + ((sum - (sum < 0) + 8) >> 4); + } + dst += PXSTRIDE(dst_stride); + tmp += tmp_stride; + } while (--h); + } + } else { // sec_strength only + assert(sec_strength); + const int sec_shift = imax(0, damping - ulog2(sec_strength)); + do { + for (int x = 0; x < w; x++) { + const int px = dst[x]; + int sum = 0; + for (int k = 0; k < 2; k++) { + const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2 + const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2 + const int s0 = tmp[x + off1]; + const int s1 = tmp[x - off1]; + const int s2 = tmp[x + off2]; + const int s3 = tmp[x - off2]; + const int sec_tap = 2 - k; + sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); + sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift); + sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); + sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); + } + dst[x] = px + ((sum - (sum < 0) + 8) >> 4); + } + dst += PXSTRIDE(dst_stride); + tmp += tmp_stride; + } while (--h); + } +} + 
+#define cdef_fn(w, h) \ +static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \ + const ptrdiff_t stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) \ +{ \ + cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \ + dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \ +} +/* Fixed-size wrappers generated by cdef_fn(): each forwards to cdef_filter_block_c() with its w and h baked in. */ +cdef_fn(4, 4); +cdef_fn(4, 8); +cdef_fn(8, 8); +/* Direction search over one 8x8 block: accumulates (pixel >> bitdepth_min_8) - 128 into line sums for 8 candidate directions, scores each direction by a weighted sum of squared line sums, returns the index with the highest score and stores (best_cost - cost[best_dir ^ 4]) >> 10 in *var. */ +static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, + unsigned *const var HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + int partial_sum_hv[2][8] = { { 0 } }; + int partial_sum_diag[2][15] = { { 0 } }; + int partial_sum_alt[4][11] = { { 0 } }; + + for (int y = 0; y < 8; y++) { + for (int x = 0; x < 8; x++) { + const int px = (img[x] >> bitdepth_min_8) - 128; + + partial_sum_diag[0][ y + x ] += px; + partial_sum_alt [0][ y + (x >> 1)] += px; + partial_sum_hv [0][ y ] += px; + partial_sum_alt [1][3 + y - (x >> 1)] += px; + partial_sum_diag[1][7 + y - x ] += px; + partial_sum_alt [2][3 - (y >> 1) + x ] += px; + partial_sum_hv [1][ x ] += px; + partial_sum_alt [3][ (y >> 1) + x ] += px; + } + img += PXSTRIDE(stride); + } + + unsigned cost[8] = { 0 }; + for (int n = 0; n < 8; n++) { + cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n]; + cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n]; + } + cost[2] *= 105; + cost[6] *= 105; + + static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 }; + for (int n = 0; n < 7; n++) { + const int d = div_table[n]; + cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] + + partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d; + cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] + + partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d; + } + cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+ cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105; + + for (int n = 0; n < 4; n++) { + unsigned *const cost_ptr = &cost[n * 2 + 1]; + for (int m = 0; m < 5; m++) + *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m]; + *cost_ptr *= 105; + for (int m = 0; m < 3; m++) { + const int d = div_table[2 * m + 1]; + *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] + + partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d; + } + } + /* Pick the direction with the highest cost; on ties the lower index is kept. */ + int best_dir = 0; + unsigned best_cost = cost[0]; + for (int n = 1; n < 8; n++) { + if (cost[n] > best_cost) { + best_cost = cost[n]; + best_dir = n; + } + } + + *var = (best_cost - (cost[best_dir ^ 4])) >> 10; + return best_dir; +} +/* Fills c with the C implementations (dir, then the 8x8, 4x8 and 4x4 block filters in fb[0..2]) and, when HAVE_ASM is set, calls the initialiser matching the build architecture on the same context. */ +COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { + c->dir = cdef_find_dir_c; + c->fb[0] = cdef_filter_block_8x8_c; + c->fb[1] = cdef_filter_block_4x8_c; + c->fb[2] = cdef_filter_block_4x4_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_cdef_dsp_init_arm)(c); +#elif ARCH_PPC64LE + bitfn(dav1d_cdef_dsp_init_ppc)(c); +#elif ARCH_X86 + bitfn(dav1d_cdef_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c new file mode 100644 index 0000000000..d2ef8d6cf7 --- /dev/null +++ b/third_party/dav1d/src/cdf.c @@ -0,0 +1,4142 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include + +#include "src/internal.h" +#include "src/tables.h" + +#define CDF1(x) (32768-(x)) + +#define CDF2(a,b) \ + CDF1(a), CDF1(b) +#define CDF3(a,b,c) \ + CDF1(a), CDF2(b,c) +#define CDF4(a,b,c,d) \ + CDF1(a), CDF3(b,c,d) +#define CDF5(a,b,c,d,e) \ + CDF1(a), CDF4(b,c,d,e) +#define CDF6(a,b,c,d,e,f) \ + CDF1(a), CDF5(b,c,d,e,f) +#define CDF7(a,b,c,d,e,f,g) \ + CDF1(a), CDF6(b,c,d,e,f,g) +#define CDF8(a,b,c,d,e,f,g,h) \ + CDF1(a), CDF7(b,c,d,e,f,g,h) +#define CDF9(a,b,c,d,e,f,g,h,i) \ + CDF1(a), CDF8(b,c,d,e,f,g,h,i) +#define CDF10(a,b,c,d,e,f,g,h,i,j) \ + CDF1(a), CDF9(b,c,d,e,f,g,h,i,j) +#define CDF11(a,b,c,d,e,f,g,h,i,j,k) \ + CDF1(a), CDF10(b,c,d,e,f,g,h,i,j,k) +#define CDF12(a,b,c,d,e,f,g,h,i,j,k,l) \ + CDF1(a), CDF11(b,c,d,e,f,g,h,i,j,k,l) +#define CDF13(a,b,c,d,e,f,g,h,i,j,k,l,m) \ + CDF1(a), CDF12(b,c,d,e,f,g,h,i,j,k,l,m) +#define CDF14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) \ + CDF1(a), CDF13(b,c,d,e,f,g,h,i,j,k,l,m,n) +#define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ + CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o) + +static const CdfModeContext av1_default_cdf = { + .y_mode = { + { CDF12(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 
27945, 29228, 29685, 30349) }, + { CDF12(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { CDF12(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { CDF12(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) }, + }, .use_filter_intra = { + [BS_4x4] = { CDF1( 4621) }, + [BS_4x8] = { CDF1( 6743) }, + [BS_8x4] = { CDF1( 5893) }, + [BS_8x8] = { CDF1( 7866) }, + [BS_8x16] = { CDF1(12551) }, + [BS_16x8] = { CDF1( 9394) }, + [BS_16x16] = { CDF1(12408) }, + [BS_16x32] = { CDF1(14301) }, + [BS_32x16] = { CDF1(12756) }, + [BS_32x32] = { CDF1(22343) }, + [BS_32x64] = { CDF1(16384) }, + [BS_64x32] = { CDF1(16384) }, + [BS_64x64] = { CDF1(16384) }, + [BS_64x128] = { CDF1(16384) }, + [BS_128x64] = { CDF1(16384) }, + [BS_128x128] = { CDF1(16384) }, + [BS_4x16] = { CDF1(12770) }, + [BS_16x4] = { CDF1(10368) }, + [BS_8x32] = { CDF1(20229) }, + [BS_32x8] = { CDF1(18101) }, + [BS_16x64] = { CDF1(16384) }, + [BS_64x16] = { CDF1(16384) }, + }, .filter_intra = { + CDF4(8949, 12776, 17211, 29558), + }, .uv_mode = { + { + { CDF12(22631, 24152, 25378, 25661, 25986, 26520, + 27055, 27923, 28244, 30059, 30941, 31961) }, + { CDF12( 9513, 26881, 26973, 27046, 27118, 27664, + 27739, 27824, 28359, 29505, 29800, 31796) }, + { CDF12( 9845, 9915, 28663, 28704, 28757, 28780, + 29198, 29822, 29854, 30764, 31777, 32029) }, + { CDF12(13639, 13897, 14171, 25331, 25606, 25727, + 25953, 27148, 28577, 30612, 31355, 32493) }, + { CDF12( 9764, 9835, 9930, 9954, 25386, 27053, + 27958, 28148, 28243, 31101, 31744, 32363) }, + { CDF12(11825, 13589, 13677, 13720, 15048, 29213, + 29301, 29458, 29711, 31161, 31441, 32550) }, + { CDF12(14175, 14399, 16608, 16821, 17718, 17775, + 28551, 30200, 30245, 31837, 32342, 32667) }, + { CDF12(12885, 13038, 14978, 15590, 15673, 15748, + 16176, 29128, 29267, 30643, 31961, 32461) }, + { CDF12(12026, 13661, 13874, 15305, 15490, 15726, + 15995, 16273, 28443, 30388, 
30767, 32416) }, + { CDF12(19052, 19840, 20579, 20916, 21150, 21467, + 21885, 22719, 23174, 28861, 30379, 32175) }, + { CDF12(18627, 19649, 20974, 21219, 21492, 21816, + 22199, 23119, 23527, 27053, 31397, 32148) }, + { CDF12(17026, 19004, 19997, 20339, 20586, 21103, + 21349, 21907, 22482, 25896, 26541, 31819) }, + { CDF12(12124, 13759, 14959, 14992, 15007, 15051, + 15078, 15166, 15255, 15753, 16039, 16606) }, + }, { + { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899, + 15656, 15986, 20086, 20995, 22455, 24212) }, + { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199, + 21451, 22099, 24228, 24693, 27032, 29472) }, + { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949, + 21695, 21774, 23138, 24256, 24703, 26679) }, + { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034, + 16741, 18371, 21520, 22206, 23389, 24182) }, + { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857, + 22253, 22411, 24911, 25380, 26027, 26376) }, + { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402, + 21753, 21981, 24780, 25386, 26517, 27176) }, + { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169, + 20682, 20803, 23188, 23763, 24455, 24940) }, + { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735, + 18827, 19059, 22336, 23204, 23964, 24793) }, + { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753, + 10417, 18898, 22494, 23139, 24764, 25989) }, + { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040, + 15004, 15534, 20714, 21789, 23443, 24861) }, + { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245, + 15235, 15902, 20102, 22696, 23774, 25838) }, + { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125, + 15163, 15636, 19676, 20474, 23519, 25208) }, + { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801, + 8064, 8232, 9248, 9875, 10521, 29048) }, + }, + }, .angle_delta = { + { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) }, + { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) }, + { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) }, + { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) }, 
+ { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) }, + { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) }, + { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) }, + { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) }, + }, .filter = { + { + { CDF2(31935, 32720) }, { CDF2( 5568, 32719) }, + { CDF2( 422, 2938) }, { CDF2(28244, 32608) }, + { CDF2(31206, 31953) }, { CDF2( 4862, 32121) }, + { CDF2( 770, 1152) }, { CDF2(20889, 25637) }, + }, { + { CDF2(31910, 32724) }, { CDF2( 4120, 32712) }, + { CDF2( 305, 2247) }, { CDF2(27403, 32636) }, + { CDF2(31022, 32009) }, { CDF2( 2963, 32093) }, + { CDF2( 601, 943) }, { CDF2(14969, 21398) }, + }, + }, .newmv_mode = { + { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) }, + { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) }, + }, .globalmv_mode = { + { CDF1( 2175) }, { CDF1( 1054) }, + }, .refmv_mode = { + { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) }, + { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) }, + }, .drl_bit = { + { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) }, + }, .comp_inter_mode = { + { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) }, + }, .intra = { + { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) }, + { CDF1(26538) }, + }, .comp = { + { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) }, + { CDF1(10640) }, { CDF1( 2901) }, + }, .comp_dir = { + { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) }, + { CDF1( 7499) }, { CDF1(22475) }, + }, .jnt_comp = { + { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) }, + { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) }, + }, .mask_comp = { + { CDF1(26607) 
}, { CDF1(22891) }, { CDF1(18840) }, + { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) }, + }, .wedge_comp = { + { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) }, + { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) }, + { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) }, + }, .wedge_idx = { + { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, + 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, + { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, + 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, + { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, + 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, + { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, + 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, + { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, + 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, + }, .interintra = { + { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) }, + { CDF1(30237) }, + }, .interintra_mode = { + { CDF3(8192, 16384, 24576) }, + { CDF3(1875, 11082, 27332) }, + { CDF3(2473, 9996, 26388) }, + { CDF3(4238, 11537, 25926) }, + }, .interintra_wedge = { + { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) }, + { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) }, + { CDF1(26872) }, + }, .ref = { + { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } }, + { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } }, + { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } }, + { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } }, + { { CDF1( 904) }, { CDF1(11014) }, { 
CDF1(26875) } }, + { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } }, + }, .comp_fwd_ref = { + { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } }, + { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } }, + { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } }, + }, .comp_bwd_ref = { + { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } }, + { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } }, + }, .comp_uni_ref = { + { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } }, + { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } }, + { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } }, + }, .txsz = { + { + { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) }, + }, { + { CDF2(12272, 30172) }, { CDF2(12272, 30172) }, + { CDF2(18677, 30848) }, + }, { + { CDF2(12986, 15180) }, { CDF2(12986, 15180) }, + { CDF2(24302, 25602) }, + }, { + { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) }, + { CDF2(16803, 22759) }, + }, + }, .txpart = { + { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } }, + { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } }, + { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } }, + { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } }, + { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } }, + { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } }, + { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } }, + }, .txtp_inter1 = { + { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, + 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, + { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, + 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, + }, .txtp_inter2 = { + CDF11( 770, 2421, 5225, 12907, 15819, 18927, + 21561, 24089, 26595, 28526, 30529) + }, .txtp_inter3 = { + { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) }, + }, .txtp_intra1 = { + { + { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) }, + { CDF6( 564, 3335, 9709, 10870, 18143, 28094) }, + { CDF6( 672, 3247, 3676, 11982, 19415, 23127) }, + { CDF6( 5279, 13885, 
15487, 18044, 23527, 30252) }, + { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) }, + { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) }, + { CDF6( 439, 2838, 3522, 6737, 18058, 23754) }, + { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) }, + { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) }, + { CDF6( 202, 3734, 4747, 7298, 17127, 24016) }, + { CDF6( 447, 4312, 6819, 8884, 16010, 23858) }, + { CDF6( 277, 4369, 5255, 8905, 16465, 22271) }, + { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) }, + }, { + { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) }, + { CDF6( 326, 8796, 14632, 15079, 19272, 27486) }, + { CDF6( 484, 7576, 7712, 14443, 19159, 22591) }, + { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) }, + { CDF6( 655, 4854, 5249, 5913, 22099, 27138) }, + { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) }, + { CDF6( 311, 5295, 5552, 6885, 16107, 22672) }, + { CDF6( 883, 8059, 8270, 11258, 17289, 21549) }, + { CDF6( 741, 7580, 9318, 10345, 16688, 29046) }, + { CDF6( 110, 7406, 7915, 9195, 16041, 23329) }, + { CDF6( 363, 7974, 9357, 10673, 15629, 24474) }, + { CDF6( 153, 7647, 8112, 9936, 15307, 19996) }, + { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) }, + }, + }, .txtp_intra2 = { + { + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + }, { + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 
26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + }, { + { CDF4( 1127, 12814, 22772, 27483) }, + { CDF4( 145, 6761, 11980, 26667) }, + { CDF4( 362, 5887, 11678, 16725) }, + { CDF4( 385, 15213, 18587, 30693) }, + { CDF4( 25, 2914, 23134, 27903) }, + { CDF4( 60, 4470, 11749, 23991) }, + { CDF4( 37, 3332, 14511, 21448) }, + { CDF4( 157, 6320, 13036, 17439) }, + { CDF4( 119, 6719, 12906, 29396) }, + { CDF4( 47, 5537, 12576, 21499) }, + { CDF4( 269, 6076, 11258, 23115) }, + { CDF4( 83, 5615, 12001, 17228) }, + { CDF4( 1968, 5556, 12023, 18547) }, + }, + }, .skip = { + { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) }, + }, .skip_mode = { + { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) }, + }, .partition = { + { + // 128x128 -> 64x64 + { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) }, + }, { + // 64x64 -> 32x32 + { CDF9(20137, 21547, 23078, 29566, 29837, + 30261, 30524, 30892, 31724) }, + { CDF9( 6732, 7490, 9497, 27944, 28250, + 28515, 28969, 29630, 30104) }, + { CDF9( 5945, 7663, 8348, 28683, 29117, + 29749, 30064, 30298, 32238) }, + { CDF9( 870, 1212, 1487, 31198, 31394, + 31574, 31743, 31881, 32332) }, + }, { + // 32x32 -> 16x16 + { CDF9(18462, 20920, 23124, 27647, 28227, + 29049, 29519, 30178, 31544) }, + { CDF9( 7689, 9060, 12056, 24992, 25660, + 26182, 26951, 28041, 29052) }, + { CDF9( 6015, 9009, 10062, 24544, 25409, + 26545, 27071, 27526, 32047) }, + { CDF9( 1394, 2208, 2796, 28614, 29061, + 29466, 29840, 30185, 31899) }, + }, { + // 16x16 -> 8x8 + { CDF9(15597, 20929, 24571, 26706, 27664, + 28821, 29601, 30571, 31902) }, + { CDF9( 7925, 11043, 16785, 22470, 23971, + 25043, 26651, 
28701, 29834) }, + { CDF9( 5414, 13269, 15111, 20488, 22360, + 24500, 25537, 26336, 32117) }, + { CDF9( 2662, 6362, 8614, 20860, 23053, + 24778, 26436, 27829, 31171) }, + }, { + // 8x8 -> 4x4 only supports the four legacy partition types + { CDF3(19132, 25510, 30392) }, + { CDF3(13928, 19855, 28540) }, + { CDF3(12522, 23679, 28629) }, + { CDF3( 9896, 18783, 25853) }, + }, + }, .seg_pred = { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + }, .seg_id = { + { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, + { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, + { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, + }, .cfl_sign = { + CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294) + }, .cfl_alpha = { + { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, + 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, + { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, + 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, + { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, + 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, + { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, + 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, + { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, + 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, + { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, + 32144, 32413, 32520, 32594, 32622, 32656, 32660) }, + }, .restore_wiener = { + CDF1(11570) + }, .restore_sgrproj = { + CDF1(16855) + }, .restore_switchable = { + CDF2( 9413, 22581) + }, .delta_q = { + CDF3(28160, 32120, 32677) + }, .delta_lf = { + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + }, .motion_mode = { + [BS_8x8] = { CDF2( 7651, 24760) }, + [BS_8x16] = { CDF2( 4738, 24765) }, + [BS_8x32] = { CDF2(28799, 31390) }, + [BS_16x8] = { CDF2( 5391, 25528) }, + 
[BS_16x16] = { CDF2(19419, 26810) }, + [BS_16x32] = { CDF2( 5123, 23606) }, + [BS_16x64] = { CDF2(28973, 31594) }, + [BS_32x8] = { CDF2(26431, 30774) }, + [BS_32x16] = { CDF2(11606, 24308) }, + [BS_32x32] = { CDF2(26260, 29116) }, + [BS_32x64] = { CDF2(20360, 28062) }, + [BS_64x16] = { CDF2(29742, 31203) }, + [BS_64x32] = { CDF2(21679, 26830) }, + [BS_64x64] = { CDF2(29516, 30701) }, + [BS_64x128] = { CDF2(28898, 30397) }, + [BS_128x64] = { CDF2(30878, 31335) }, + [BS_128x128] = { CDF2(32507, 32558) }, + }, .obmc = { + [BS_8x8] = { CDF1(10437) }, + [BS_8x16] = { CDF1( 9371) }, + [BS_8x32] = { CDF1(23664) }, + [BS_16x8] = { CDF1( 9301) }, + [BS_16x16] = { CDF1(17432) }, + [BS_16x32] = { CDF1(14423) }, + [BS_16x64] = { CDF1(24008) }, + [BS_32x8] = { CDF1(20901) }, + [BS_32x16] = { CDF1(15142) }, + [BS_32x32] = { CDF1(25817) }, + [BS_32x64] = { CDF1(22823) }, + [BS_64x16] = { CDF1(26879) }, + [BS_64x32] = { CDF1(22083) }, + [BS_64x64] = { CDF1(30128) }, + [BS_64x128] = { CDF1(31014) }, + [BS_128x64] = { CDF1(31560) }, + [BS_128x128] = { CDF1(32638) }, + }, .pal_y = { + { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } }, + { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } }, + { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } }, + { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } }, + { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } }, + { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } }, + { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } }, + }, .pal_sz = { + { + { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) }, + { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) }, + { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) }, + { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) }, + { CDF6(12725, 19180, 21863, 24839, 27535, 30120) }, + { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) }, + { CDF6(14940, 20797, 21678, 24186, 27033, 28999) }, + }, { + { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) }, + { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) }, + { CDF6( 
4426, 11260, 17999, 21483, 25863, 29430) }, + { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) }, + { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) }, + { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) }, + { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) }, + }, + }, .pal_uv = { + { CDF1(32461) }, { CDF1(21488) }, + }, .color_map = { + { /* y */ + { + { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) }, + { CDF1(27036) }, { CDF1(31603) }, + }, { + { CDF2(27877, 30490) }, { CDF2(11532, 25697) }, + { CDF2( 6544, 30234) }, { CDF2(23018, 28072) }, + { CDF2(31915, 32385) }, + }, { + { CDF3(25572, 28046, 30045) }, + { CDF3( 9478, 21590, 27256) }, + { CDF3( 7248, 26837, 29824) }, + { CDF3(19167, 24486, 28349) }, + { CDF3(31400, 31825, 32250) }, + }, { + { CDF4(24779, 26955, 28576, 30282) }, + { CDF4( 8669, 20364, 24073, 28093) }, + { CDF4( 4255, 27565, 29377, 31067) }, + { CDF4(19864, 23674, 26716, 29530) }, + { CDF4(31646, 31893, 32147, 32426) }, + }, { + { CDF5(23132, 25407, 26970, 28435, 30073) }, + { CDF5( 7443, 17242, 20717, 24762, 27982) }, + { CDF5( 6300, 24862, 26944, 28784, 30671) }, + { CDF5(18916, 22895, 25267, 27435, 29652) }, + { CDF5(31270, 31550, 31808, 32059, 32353) }, + }, { + { CDF6(23105, 25199, 26464, 27684, 28931, 30318) }, + { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) }, + { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) }, + { CDF6(18544, 22373, 24457, 26195, 28119, 30045) }, + { CDF6(31198, 31451, 31670, 31882, 32123, 32391) }, + }, { + { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }, { /* uv */ + { + { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) }, + { CDF1(29257) }, { CDF1(31610) }, + }, { + { CDF2(25257, 29145) }, { CDF2(12287, 27293) }, + { CDF2( 7033, 27960) }, { CDF2(20145, 25405) 
}, + { CDF2(30608, 31639) }, + }, { + { CDF3(24210, 27175, 29903) }, + { CDF3( 9888, 22386, 27214) }, + { CDF3( 5901, 26053, 29293) }, + { CDF3(18318, 22152, 28333) }, + { CDF3(30459, 31136, 31926) }, + }, { + { CDF4(22980, 25479, 27781, 29986) }, + { CDF4( 8413, 21408, 24859, 28874) }, + { CDF4( 2257, 29449, 30594, 31598) }, + { CDF4(19189, 21202, 25915, 28620) }, + { CDF4(31844, 32044, 32281, 32518) }, + }, { + { CDF5(22217, 24567, 26637, 28683, 30548) }, + { CDF5( 7307, 16406, 19636, 24632, 28424) }, + { CDF5( 4441, 25064, 26879, 28942, 30919) }, + { CDF5(17210, 20528, 23319, 26750, 29582) }, + { CDF5(30674, 30953, 31396, 31735, 32207) }, + }, { + { CDF6(21239, 23168, 25044, 26962, 28705, 30506) }, + { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) }, + { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) }, + { CDF6(15889, 18323, 21704, 24698, 26976, 29690) }, + { CDF6(30988, 31204, 31479, 31734, 31983, 32325) }, + }, { + { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, + }, + }, .intrabc = { + CDF1(30531) + }, +}; + +static const CdfMvComponent default_mv_component_cdf = { + .classes = { + CDF10(28672, 30976, 31858, 32320, 32551, + 32656, 32740, 32757, 32762, 32767) + }, .class0 = { + CDF1(27648) + }, .classN = { + { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) }, + { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) }, + { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) }, + { CDF1(30720) }, + }, .class0_fp = { + { CDF3(16384, 24576, 26624) }, + { CDF3(12288, 21248, 24128) }, + }, .classN_fp = { + CDF3( 8192, 17408, 21248) + }, .class0_hp = { + CDF1(20480) + }, .classN_hp = { + CDF1(16384) + }, .sign = { + CDF1(16384) + }, +}; + +static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = { + CDF3( 4096, 
11264, 19328) +}; + +static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = { + { + { CDF12(15588, 17027, 19338, 20218, 20682, 21110, + 21825, 23244, 24189, 28165, 29093, 30466) }, + { CDF12(12016, 18066, 19516, 20303, 20719, 21444, + 21888, 23032, 24434, 28658, 30172, 31409) }, + { CDF12(10052, 10771, 22296, 22788, 23055, 23239, + 24133, 25620, 26160, 29336, 29929, 31567) }, + { CDF12(14091, 15406, 16442, 18808, 19136, 19546, + 19998, 22096, 24746, 29585, 30958, 32462) }, + { CDF12(12122, 13265, 15603, 16501, 18609, 20033, + 22391, 25583, 26437, 30261, 31073, 32475) }, + }, { + { CDF12(10023, 19585, 20848, 21440, 21832, 22760, + 23089, 24023, 25381, 29014, 30482, 31436) }, + { CDF12( 5983, 24099, 24560, 24886, 25066, 25795, + 25913, 26423, 27610, 29905, 31276, 31794) }, + { CDF12( 7444, 12781, 20177, 20728, 21077, 21607, + 22170, 23405, 24469, 27915, 29090, 30492) }, + { CDF12( 8537, 14689, 15432, 17087, 17408, 18172, + 18408, 19825, 24649, 29153, 31096, 32210) }, + { CDF12( 7543, 14231, 15496, 16195, 17905, 20717, + 21984, 24516, 26001, 29675, 30981, 31994) }, + }, { + { CDF12(12613, 13591, 21383, 22004, 22312, 22577, + 23401, 25055, 25729, 29538, 30305, 32077) }, + { CDF12( 9687, 13470, 18506, 19230, 19604, 20147, + 20695, 22062, 23219, 27743, 29211, 30907) }, + { CDF12( 6183, 6505, 26024, 26252, 26366, 26434, + 27082, 28354, 28555, 30467, 30794, 32086) }, + { CDF12(10718, 11734, 14954, 17224, 17565, 17924, + 18561, 21523, 23878, 28975, 30287, 32252) }, + { CDF12( 9194, 9858, 16501, 17263, 18424, 19171, + 21563, 25961, 26561, 30072, 30737, 32463) }, + }, { + { CDF12(12602, 14399, 15488, 18381, 18778, 19315, + 19724, 21419, 25060, 29696, 30917, 32409) }, + { CDF12( 8203, 13821, 14524, 17105, 17439, 18131, + 18404, 19468, 25225, 29485, 31158, 32342) }, + { CDF12( 8451, 9731, 15004, 17643, 18012, 18425, + 19070, 21538, 24605, 29118, 30078, 32018) }, + { CDF12( 7714, 9048, 9516, 16667, 16817, 16994, + 17153, 18767, 26743, 30389, 
31536, 32528) }, + { CDF12( 8843, 10280, 11496, 15317, 16652, 17943, + 19108, 22718, 25769, 29953, 30983, 32485) }, + }, { + { CDF12(12578, 13671, 15979, 16834, 19075, 20913, + 22989, 25449, 26219, 30214, 31150, 32477) }, + { CDF12( 9563, 13626, 15080, 15892, 17756, 20863, + 22207, 24236, 25380, 29653, 31143, 32277) }, + { CDF12( 8356, 8901, 17616, 18256, 19350, 20106, + 22598, 25947, 26466, 29900, 30523, 32261) }, + { CDF12(10835, 11815, 13124, 16042, 17018, 18039, + 18947, 22753, 24615, 29489, 30883, 32482) }, + { CDF12( 7618, 8288, 9859, 10509, 15386, 18657, + 22903, 28776, 29180, 31355, 31802, 32593) }, + }, +}; + +static const CdfCoefContext av1_default_coef_cdf[4] = { + [0] = { + .skip = { + { + { CDF1(31849) }, { CDF1( 5892) }, { CDF1(12112) }, + { CDF1(21935) }, { CDF1(20289) }, { CDF1(27473) }, + { CDF1(32487) }, { CDF1( 7654) }, { CDF1(19473) }, + { CDF1(29984) }, { CDF1( 9961) }, { CDF1(30242) }, + { CDF1(32117) }, + }, { + { CDF1(31548) }, { CDF1( 1549) }, { CDF1(10130) }, + { CDF1(16656) }, { CDF1(18591) }, { CDF1(26308) }, + { CDF1(32537) }, { CDF1( 5403) }, { CDF1(18096) }, + { CDF1(30003) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(29957) }, { CDF1( 5391) }, { CDF1(18039) }, + { CDF1(23566) }, { CDF1(22431) }, { CDF1(25822) }, + { CDF1(32197) }, { CDF1( 3778) }, { CDF1(15336) }, + { CDF1(28981) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(17920) }, { CDF1( 1818) }, { CDF1( 7282) }, + { CDF1(25273) }, { CDF1(10923) }, { CDF1(31554) }, + { CDF1(32624) }, { CDF1( 1366) }, { CDF1(15628) }, + { CDF1(30462) }, { CDF1( 146) }, { CDF1( 5132) }, + { CDF1(31657) }, + }, { + { CDF1( 6308) }, { CDF1( 117) }, { CDF1( 1638) }, + { CDF1( 2161) }, { CDF1(16384) }, { CDF1(10923) }, + { CDF1(30247) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, + }, .eob_bin_16 = { + { + { CDF4( 840, 1039, 1980, 4895) }, + { CDF4( 370, 671, 1883, 4471) }, + }, { + 
{ CDF4( 3247, 4950, 9688, 14563) }, + { CDF4( 1904, 3354, 7763, 14647) }, + }, + }, .eob_bin_32 = { + { + { CDF5( 400, 520, 977, 2102, 6542) }, + { CDF5( 210, 405, 1315, 3326, 7537) }, + }, { + { CDF5( 2636, 4273, 7588, 11794, 20401) }, + { CDF5( 1786, 3179, 6902, 11357, 19054) }, + }, + }, .eob_bin_64 = { + { + { CDF6( 329, 498, 1101, 1784, 3265, 7758) }, + { CDF6( 335, 730, 1459, 5494, 8755, 12997) }, + }, { + { CDF6( 3505, 5304, 10086, 13814, 17684, 23370) }, + { CDF6( 1563, 2700, 4876, 10911, 14706, 22480) }, + }, + }, .eob_bin_128 = { + { + { CDF7( 219, 482, 1140, 2091, 3680, 6028, 12586) }, + { CDF7( 371, 699, 1254, 4830, 9479, 12562, 17497) }, + }, { + { CDF7( 5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { CDF7( 2054, 3472, 5869, 14232, 18242, 20590, 26752) }, + }, + }, .eob_bin_256 = { + { + { CDF8( 310, 584, 1887, 3589, + 6168, 8611, 11352, 15652) }, + { CDF8( 998, 1850, 2998, 5604, + 17341, 19888, 22899, 25583) }, + }, { + { CDF8( 2520, 3240, 5952, 8870, + 12577, 17558, 19954, 24168) }, + { CDF8( 2203, 4130, 7435, 10739, + 20652, 23681, 25609, 27261) }, + }, + }, .eob_bin_512 = { + { CDF9( 641, 983, 3707, 5430, 10234, + 14958, 18788, 23412, 26061) }, + { CDF9( 5095, 6446, 9996, 13354, 16017, + 17986, 20919, 26129, 29140) }, + }, .eob_bin_1024 = { + { CDF10( 393, 421, 751, 1623, 3160, + 6352, 13345, 18047, 22571, 25830) }, + { CDF10( 1865, 1988, 2930, 4242, 10533, + 16538, 21354, 27255, 28546, 31784) }, + }, .eob_hi_bit = { + { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16961) }, + { CDF1(17223) }, { CDF1( 7621) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(19069) }, + { CDF1(22525) }, { CDF1(13377) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20401) }, + { CDF1(17025) }, { CDF1(12845) }, { CDF1(12873) }, + { 
CDF1(14094) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20681) }, + { CDF1(20701) }, { CDF1(15250) }, { CDF1(15017) }, + { CDF1(14928) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(23905) }, + { CDF1(17194) }, { CDF1(16170) }, { CDF1(17695) }, + { CDF1(13826) }, { CDF1(15810) }, { CDF1(12036) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(23959) }, + { CDF1(20799) }, { CDF1(19021) }, { CDF1(16203) }, + { CDF1(17886) }, { CDF1(14144) }, { CDF1(12010) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(27399) }, + { CDF1(16327) }, { CDF1(18071) }, { CDF1(19584) }, + { CDF1(20721) }, { CDF1(18432) }, { CDF1(19560) }, + { CDF1(10150) }, { CDF1( 8805) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(24932) }, + { CDF1(20833) }, { CDF1(12027) }, { CDF1(16670) }, + { CDF1(19914) }, { CDF1(15106) }, { CDF1(17662) }, + { CDF1(13783) }, { CDF1(28756) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(23406) }, + { CDF1(21845) }, { CDF1(18432) }, { CDF1(16384) }, + { CDF1(17096) }, { CDF1(12561) }, { CDF1(17320) }, + { CDF1(22395) }, { CDF1(21370) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, + }, .eob_base_tok = { + { + { + { CDF2(17837, 29055) }, { CDF2(29600, 31446) }, + { CDF2(30844, 31878) }, { CDF2(24926, 28948) }, + }, { + { CDF2(21365, 30026) }, { CDF2(30512, 32423) }, + { CDF2(31658, 32621) }, { CDF2(29630, 31881) }, + }, + }, { + { + { CDF2( 5717, 26477) }, { CDF2(30491, 31703) }, + { CDF2(31550, 32158) }, { CDF2(29648, 31491) }, + }, { + { CDF2(12608, 27820) }, { CDF2(30680, 32225) }, + { CDF2(30809, 32335) }, { CDF2(31299, 32423) }, + 
}, + }, { + { + { CDF2( 1786, 12612) }, { CDF2(30663, 31625) }, + { CDF2(32339, 32468) }, { CDF2(31148, 31833) }, + }, { + { CDF2(18857, 23865) }, { CDF2(31428, 32428) }, + { CDF2(31744, 32373) }, { CDF2(31775, 32526) }, + }, + }, { + { + { CDF2( 1787, 2532) }, { CDF2(30832, 31662) }, + { CDF2(31824, 32682) }, { CDF2(32133, 32569) }, + }, { + { CDF2(13751, 22235) }, { CDF2(32089, 32409) }, + { CDF2(27084, 27920) }, { CDF2(29291, 32594) }, + }, + }, { + { + { CDF2( 1725, 3449) }, { CDF2(31102, 31935) }, + { CDF2(32457, 32613) }, { CDF2(32412, 32649) }, + }, { + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + }, + }, + }, .base_tok = { + { + { + { CDF3( 4034, 8930, 12727) }, + { CDF3(18082, 29741, 31877) }, + { CDF3(12596, 26124, 30493) }, + { CDF3( 9446, 21118, 27005) }, + { CDF3( 6308, 15141, 21279) }, + { CDF3( 2463, 6357, 9783) }, + { CDF3(20667, 30546, 31929) }, + { CDF3(13043, 26123, 30134) }, + { CDF3( 8151, 18757, 24778) }, + { CDF3( 5255, 12839, 18632) }, + { CDF3( 2820, 7206, 11161) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(15736, 27553, 30604) }, + { CDF3(11210, 23794, 28787) }, + { CDF3( 5947, 13874, 19701) }, + { CDF3( 4215, 9323, 13891) }, + { CDF3( 2833, 6462, 10059) }, + { CDF3(19605, 30393, 31582) }, + { CDF3(13523, 26252, 30248) }, + { CDF3( 8446, 18622, 24512) }, + { CDF3( 3818, 10343, 15974) }, + { CDF3( 1481, 4117, 6796) }, + { CDF3(22649, 31302, 32190) }, + { CDF3(14829, 27127, 30449) }, + { CDF3( 8313, 17702, 23304) }, + { CDF3( 3022, 8301, 12786) }, + { CDF3( 1536, 4412, 7184) }, + { CDF3(22354, 29774, 31372) }, + { CDF3(14723, 25472, 29214) }, + { CDF3( 6673, 13745, 18662) }, + { CDF3( 2068, 5766, 
9322) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 6302, 16444, 21761) }, + { CDF3(23040, 31538, 32475) }, + { CDF3(15196, 28452, 31496) }, + { CDF3(10020, 22946, 28514) }, + { CDF3( 6533, 16862, 23501) }, + { CDF3( 3538, 9816, 15076) }, + { CDF3(24444, 31875, 32525) }, + { CDF3(15881, 28924, 31635) }, + { CDF3( 9922, 22873, 28466) }, + { CDF3( 6527, 16966, 23691) }, + { CDF3( 4114, 11303, 17220) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(20201, 30770, 32209) }, + { CDF3(14754, 28071, 31258) }, + { CDF3( 8378, 20186, 26517) }, + { CDF3( 5916, 15299, 21978) }, + { CDF3( 4268, 11583, 17901) }, + { CDF3(24361, 32025, 32581) }, + { CDF3(18673, 30105, 31943) }, + { CDF3(10196, 22244, 27576) }, + { CDF3( 5495, 14349, 20417) }, + { CDF3( 2676, 7415, 11498) }, + { CDF3(24678, 31958, 32585) }, + { CDF3(18629, 29906, 31831) }, + { CDF3( 9364, 20724, 26315) }, + { CDF3( 4641, 12318, 18094) }, + { CDF3( 2758, 7387, 11579) }, + { CDF3(25433, 31842, 32469) }, + { CDF3(18795, 29289, 31411) }, + { CDF3( 7644, 17584, 23592) }, + { CDF3( 3408, 9014, 15047) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 4536, 10072, 14001) }, + { CDF3(25459, 31416, 32206) }, + { CDF3(16605, 28048, 30818) }, + { CDF3(11008, 22857, 27719) }, + { CDF3( 6915, 16268, 22315) }, + { CDF3( 2625, 6812, 10537) }, + { CDF3(24257, 31788, 32499) }, + { CDF3(16880, 29454, 31879) }, + { CDF3(11958, 25054, 29778) }, + { CDF3( 7916, 18718, 25084) }, + { CDF3( 3383, 8777, 13446) }, + { CDF3(22720, 31603, 32393) }, + { CDF3(14960, 28125, 31335) }, + { CDF3( 9731, 22210, 27928) }, + { CDF3( 6304, 15832, 22277) }, + { CDF3( 2910, 7818, 12166) }, + { CDF3(20375, 30627, 32131) }, + { CDF3(13904, 27284, 
30887) }, + { CDF3( 9368, 21558, 27144) }, + { CDF3( 5937, 14966, 21119) }, + { CDF3( 2667, 7225, 11319) }, + { CDF3(23970, 31470, 32378) }, + { CDF3(17173, 29734, 32018) }, + { CDF3(12795, 25441, 29965) }, + { CDF3( 8981, 19680, 25893) }, + { CDF3( 4728, 11372, 16902) }, + { CDF3(24287, 31797, 32439) }, + { CDF3(16703, 29145, 31696) }, + { CDF3(10833, 23554, 28725) }, + { CDF3( 6468, 16566, 23057) }, + { CDF3( 2415, 6562, 10278) }, + { CDF3(26610, 32395, 32659) }, + { CDF3(18590, 30498, 32117) }, + { CDF3(12420, 25756, 29950) }, + { CDF3( 7639, 18746, 24710) }, + { CDF3( 3001, 8086, 12347) }, + { CDF3(25076, 32064, 32580) }, + { CDF3(17946, 30128, 32028) }, + { CDF3(12024, 24985, 29378) }, + { CDF3( 7517, 18390, 24304) }, + { CDF3( 3243, 8781, 13331) }, + }, { + { CDF3( 6037, 16771, 21957) }, + { CDF3(24774, 31704, 32426) }, + { CDF3(16830, 28589, 31056) }, + { CDF3(10602, 22828, 27760) }, + { CDF3( 6733, 16829, 23071) }, + { CDF3( 3250, 8914, 13556) }, + { CDF3(25582, 32220, 32668) }, + { CDF3(18659, 30342, 32223) }, + { CDF3(12546, 26149, 30515) }, + { CDF3( 8420, 20451, 26801) }, + { CDF3( 4636, 12420, 18344) }, + { CDF3(27581, 32362, 32639) }, + { CDF3(18987, 30083, 31978) }, + { CDF3(11327, 24248, 29084) }, + { CDF3( 7264, 17719, 24120) }, + { CDF3( 3995, 10768, 16169) }, + { CDF3(25893, 31831, 32487) }, + { CDF3(16577, 28587, 31379) }, + { CDF3(10189, 22748, 28182) }, + { CDF3( 6832, 17094, 23556) }, + { CDF3( 3708, 10110, 15334) }, + { CDF3(25904, 32282, 32656) }, + { CDF3(19721, 30792, 32276) }, + { CDF3(12819, 26243, 30411) }, + { CDF3( 8572, 20614, 26891) }, + { CDF3( 5364, 14059, 20467) }, + { CDF3(26580, 32438, 32677) }, + { CDF3(20852, 31225, 32340) }, + { CDF3(12435, 25700, 29967) }, + { CDF3( 8691, 20825, 26976) }, + { CDF3( 4446, 12209, 17269) }, + { CDF3(27350, 32429, 32696) }, + { CDF3(21372, 30977, 32272) }, + { CDF3(12673, 25270, 29853) }, + { CDF3( 9208, 20925, 26640) }, + { CDF3( 5018, 13351, 18732) }, + { CDF3(27351, 32479, 32713) }, + { 
CDF3(21398, 31209, 32387) }, + { CDF3(12162, 25047, 29842) }, + { CDF3( 7896, 18691, 25319) }, + { CDF3( 4670, 12882, 18881) }, + }, + }, { + { + { CDF3( 5487, 10460, 13708) }, + { CDF3(21597, 28303, 30674) }, + { CDF3(11037, 21953, 26476) }, + { CDF3( 8147, 17962, 22952) }, + { CDF3( 5242, 13061, 18532) }, + { CDF3( 1889, 5208, 8182) }, + { CDF3(26774, 32133, 32590) }, + { CDF3(17844, 29564, 31767) }, + { CDF3(11690, 24438, 29171) }, + { CDF3( 7542, 18215, 24459) }, + { CDF3( 2993, 8050, 12319) }, + { CDF3(28023, 32328, 32591) }, + { CDF3(18651, 30126, 31954) }, + { CDF3(12164, 25146, 29589) }, + { CDF3( 7762, 18530, 24771) }, + { CDF3( 3492, 9183, 13920) }, + { CDF3(27591, 32008, 32491) }, + { CDF3(17149, 28853, 31510) }, + { CDF3(11485, 24003, 28860) }, + { CDF3( 7697, 18086, 24210) }, + { CDF3( 3075, 7999, 12218) }, + { CDF3(28268, 32482, 32654) }, + { CDF3(19631, 31051, 32404) }, + { CDF3(13860, 27260, 31020) }, + { CDF3( 9605, 21613, 27594) }, + { CDF3( 4876, 12162, 17908) }, + { CDF3(27248, 32316, 32576) }, + { CDF3(18955, 30457, 32075) }, + { CDF3(11824, 23997, 28795) }, + { CDF3( 7346, 18196, 24647) }, + { CDF3( 3403, 9247, 14111) }, + { CDF3(29711, 32655, 32735) }, + { CDF3(21169, 31394, 32417) }, + { CDF3(13487, 27198, 30957) }, + { CDF3( 8828, 21683, 27614) }, + { CDF3( 4270, 11451, 17038) }, + { CDF3(28708, 32578, 32731) }, + { CDF3(20120, 31241, 32482) }, + { CDF3(13692, 27550, 31321) }, + { CDF3( 9418, 22514, 28439) }, + { CDF3( 4999, 13283, 19462) }, + }, { + { CDF3( 5673, 14302, 19711) }, + { CDF3(26251, 30701, 31834) }, + { CDF3(12782, 23783, 27803) }, + { CDF3( 9127, 20657, 25808) }, + { CDF3( 6368, 16208, 21462) }, + { CDF3( 2465, 7177, 10822) }, + { CDF3(29961, 32563, 32719) }, + { CDF3(18318, 29891, 31949) }, + { CDF3(11361, 24514, 29357) }, + { CDF3( 7900, 19603, 25607) }, + { CDF3( 4002, 10590, 15546) }, + { CDF3(29637, 32310, 32595) }, + { CDF3(18296, 29913, 31809) }, + { CDF3(10144, 21515, 26871) }, + { CDF3( 5358, 14322, 20394) }, + { 
CDF3( 3067, 8362, 13346) }, + { CDF3(28652, 32470, 32676) }, + { CDF3(17538, 30771, 32209) }, + { CDF3(13924, 26882, 30494) }, + { CDF3(10496, 22837, 27869) }, + { CDF3( 7236, 16396, 21621) }, + { CDF3(30743, 32687, 32746) }, + { CDF3(23006, 31676, 32489) }, + { CDF3(14494, 27828, 31120) }, + { CDF3(10174, 22801, 28352) }, + { CDF3( 6242, 15281, 21043) }, + { CDF3(25817, 32243, 32720) }, + { CDF3(18618, 31367, 32325) }, + { CDF3(13997, 28318, 31878) }, + { CDF3(12255, 26534, 31383) }, + { CDF3( 9561, 21588, 28450) }, + { CDF3(28188, 32635, 32724) }, + { CDF3(22060, 32365, 32728) }, + { CDF3(18102, 30690, 32528) }, + { CDF3(14196, 28864, 31999) }, + { CDF3(12262, 25792, 30865) }, + { CDF3(24176, 32109, 32628) }, + { CDF3(18280, 29681, 31963) }, + { CDF3(10205, 23703, 29664) }, + { CDF3( 7889, 20025, 27676) }, + { CDF3( 6060, 16743, 23970) }, + }, + }, { + { + { CDF3( 5141, 7096, 8260) }, + { CDF3(27186, 29022, 29789) }, + { CDF3( 6668, 12568, 15682) }, + { CDF3( 2172, 6181, 8638) }, + { CDF3( 1126, 3379, 4531) }, + { CDF3( 443, 1361, 2254) }, + { CDF3(26083, 31153, 32436) }, + { CDF3(13486, 24603, 28483) }, + { CDF3( 6508, 14840, 19910) }, + { CDF3( 3386, 8800, 13286) }, + { CDF3( 1530, 4322, 7054) }, + { CDF3(29639, 32080, 32548) }, + { CDF3(15897, 27552, 30290) }, + { CDF3( 8588, 20047, 25383) }, + { CDF3( 4889, 13339, 19269) }, + { CDF3( 2240, 6871, 10498) }, + { CDF3(28165, 32197, 32517) }, + { CDF3(20735, 30427, 31568) }, + { CDF3(14325, 24671, 27692) }, + { CDF3( 5119, 12554, 17805) }, + { CDF3( 1810, 5441, 8261) }, + { CDF3(31212, 32724, 32748) }, + { CDF3(23352, 31766, 32545) }, + { CDF3(14669, 27570, 31059) }, + { CDF3( 8492, 20894, 27272) }, + { CDF3( 3644, 10194, 15204) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 
24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 2461, 7013, 9371) }, + { CDF3(24749, 29600, 30986) }, + { CDF3( 9466, 19037, 22417) }, + { CDF3( 3584, 9280, 14400) }, + { CDF3( 1505, 3929, 5433) }, + { CDF3( 677, 1500, 2736) }, + { CDF3(23987, 30702, 32117) }, + { CDF3(13554, 24571, 29263) }, + { CDF3( 6211, 14556, 21155) }, + { CDF3( 3135, 10972, 15625) }, + { CDF3( 2435, 7127, 11427) }, + { CDF3(31300, 32532, 32550) }, + { CDF3(14757, 30365, 31954) }, + { CDF3( 4405, 11612, 18553) }, + { CDF3( 580, 4132, 7322) }, + { CDF3( 1695, 10169, 14124) }, + { CDF3(30008, 32282, 32591) }, + { CDF3(19244, 30108, 31748) }, + { CDF3(11180, 24158, 29555) }, + { CDF3( 5650, 14972, 19209) }, + { CDF3( 2114, 5109, 8456) }, + { CDF3(31856, 32716, 32748) }, + { CDF3(23012, 31664, 32572) }, + { CDF3(13694, 26656, 30636) }, + { CDF3( 8142, 19508, 26093) }, + { CDF3( 4253, 10955, 16724) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 601, 983, 1311) }, + { CDF3(18725, 23406, 28087) }, + { CDF3( 5461, 8192, 10923) }, + { CDF3( 3781, 15124, 21425) }, + { CDF3( 2587, 7761, 12072) }, + { CDF3( 106, 458, 810) }, + { CDF3(22282, 29710, 31894) }, + { CDF3( 8508, 20926, 25984) }, + { CDF3( 3726, 12713, 18083) }, + { CDF3( 1620, 7112, 10893) }, + { CDF3( 729, 2236, 3495) }, + { CDF3(30163, 32474, 32684) }, + { CDF3(18304, 30464, 32000) }, + { CDF3(11443, 
26526, 29647) }, + { CDF3( 6007, 15292, 21299) }, + { CDF3( 2234, 6703, 8937) }, + { CDF3(30954, 32177, 32571) }, + { CDF3(17363, 29562, 31076) }, + { CDF3( 9686, 22464, 27410) }, + { CDF3( 8192, 16384, 21390) }, + { CDF3( 1755, 8046, 11264) }, + { CDF3(31168, 32734, 32748) }, + { CDF3(22486, 31441, 32471) }, + { CDF3(12833, 25627, 29738) }, + { CDF3( 6980, 17379, 23122) }, + { CDF3( 3111, 8887, 13479) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, 
+ { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, + }, .dc_sign = { + { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } }, + { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } }, + }, .br_tok = { + { + { + { CDF3(14298, 20718, 24174) }, + { CDF3(12536, 19601, 23789) }, + { CDF3( 8712, 15051, 19503) }, + { CDF3( 6170, 11327, 15434) }, + { CDF3( 4742, 8926, 12538) }, + { CDF3( 3803, 7317, 10546) }, + { CDF3( 1696, 3317, 4871) }, + { CDF3(14392, 19951, 22756) }, + { CDF3(15978, 23218, 26818) }, + { CDF3(12187, 19474, 23889) }, + { CDF3( 9176, 15640, 20259) }, + { CDF3( 7068, 12655, 17028) }, + { CDF3( 5656, 10442, 14472) }, + { CDF3( 2580, 4992, 7244) }, + { CDF3(12136, 18049, 21426) }, + { CDF3(13784, 20721, 24481) }, + { CDF3(10836, 17621, 21900) }, + { CDF3( 8372, 14444, 18847) }, + { CDF3( 6523, 11779, 16000) }, + { CDF3( 5337, 9898, 13760) }, + { CDF3( 3034, 5860, 8462) }, + }, { + { CDF3(15967, 22905, 26286) }, + { CDF3(13534, 20654, 24579) }, + { CDF3( 9504, 16092, 20535) }, + { CDF3( 6975, 12568, 16903) }, + { CDF3( 5364, 10091, 14020) }, + { CDF3( 4357, 8370, 11857) }, + { CDF3( 2506, 4934, 7218) }, + { CDF3(23032, 28815, 30936) }, + { CDF3(19540, 26704, 29719) }, + { CDF3(15158, 22969, 27097) }, + { CDF3(11408, 18865, 23650) }, + { CDF3( 8885, 15448, 20250) }, + { CDF3( 7108, 12853, 17416) }, + { CDF3( 4231, 8041, 11480) }, + { CDF3(19823, 26490, 29156) }, + { CDF3(18890, 25929, 28932) }, + { CDF3(15660, 23491, 27433) }, + { CDF3(12147, 19776, 24488) }, + { CDF3( 9728, 16774, 21649) }, + { CDF3( 7919, 14277, 19066) }, + { CDF3( 5440, 10170, 14185) }, + }, + }, { + { + { CDF3(14406, 20862, 24414) }, + { CDF3(11824, 18907, 23109) }, + { CDF3( 8257, 14393, 18803) }, + { CDF3( 5860, 10747, 14778) }, + { CDF3( 4475, 8486, 11984) }, + { 
CDF3( 3606, 6954, 10043) }, + { CDF3( 1736, 3410, 5048) }, + { CDF3(14430, 20046, 22882) }, + { CDF3(15593, 22899, 26709) }, + { CDF3(12102, 19368, 23811) }, + { CDF3( 9059, 15584, 20262) }, + { CDF3( 6999, 12603, 17048) }, + { CDF3( 5684, 10497, 14553) }, + { CDF3( 2822, 5438, 7862) }, + { CDF3(15785, 21585, 24359) }, + { CDF3(18347, 25229, 28266) }, + { CDF3(14974, 22487, 26389) }, + { CDF3(11423, 18681, 23271) }, + { CDF3( 8863, 15350, 20008) }, + { CDF3( 7153, 12852, 17278) }, + { CDF3( 3707, 7036, 9982) }, + }, { + { CDF3(15460, 21696, 25469) }, + { CDF3(12170, 19249, 23191) }, + { CDF3( 8723, 15027, 19332) }, + { CDF3( 6428, 11704, 15874) }, + { CDF3( 4922, 9292, 13052) }, + { CDF3( 4139, 7695, 11010) }, + { CDF3( 2291, 4508, 6598) }, + { CDF3(19856, 26920, 29828) }, + { CDF3(17923, 25289, 28792) }, + { CDF3(14278, 21968, 26297) }, + { CDF3(10910, 18136, 22950) }, + { CDF3( 8423, 14815, 19627) }, + { CDF3( 6771, 12283, 16774) }, + { CDF3( 4074, 7750, 11081) }, + { CDF3(19852, 26074, 28672) }, + { CDF3(19371, 26110, 28989) }, + { CDF3(16265, 23873, 27663) }, + { CDF3(12758, 20378, 24952) }, + { CDF3(10095, 17098, 21961) }, + { CDF3( 8250, 14628, 19451) }, + { CDF3( 5205, 9745, 13622) }, + }, + }, { + { + { CDF3(10563, 16233, 19763) }, + { CDF3( 9794, 16022, 19804) }, + { CDF3( 6750, 11945, 15759) }, + { CDF3( 4963, 9186, 12752) }, + { CDF3( 3845, 7435, 10627) }, + { CDF3( 3051, 6085, 8834) }, + { CDF3( 1311, 2596, 3830) }, + { CDF3(11246, 16404, 19689) }, + { CDF3(12315, 18911, 22731) }, + { CDF3(10557, 17095, 21289) }, + { CDF3( 8136, 14006, 18249) }, + { CDF3( 6348, 11474, 15565) }, + { CDF3( 5196, 9655, 13400) }, + { CDF3( 2349, 4526, 6587) }, + { CDF3(13337, 18730, 21569) }, + { CDF3(19306, 26071, 28882) }, + { CDF3(15952, 23540, 27254) }, + { CDF3(12409, 19934, 24430) }, + { CDF3( 9760, 16706, 21389) }, + { CDF3( 8004, 14220, 18818) }, + { CDF3( 4138, 7794, 10961) }, + }, { + { CDF3(10870, 16684, 20949) }, + { CDF3( 9664, 15230, 18680) }, + { CDF3( 6886, 
12109, 15408) }, + { CDF3( 4825, 8900, 12305) }, + { CDF3( 3630, 7162, 10314) }, + { CDF3( 3036, 6429, 9387) }, + { CDF3( 1671, 3296, 4940) }, + { CDF3(13819, 19159, 23026) }, + { CDF3(11984, 19108, 23120) }, + { CDF3(10690, 17210, 21663) }, + { CDF3( 7984, 14154, 18333) }, + { CDF3( 6868, 12294, 16124) }, + { CDF3( 5274, 8994, 12868) }, + { CDF3( 2988, 5771, 8424) }, + { CDF3(19736, 26647, 29141) }, + { CDF3(18933, 26070, 28984) }, + { CDF3(15779, 23048, 27200) }, + { CDF3(12638, 20061, 24532) }, + { CDF3(10692, 17545, 22220) }, + { CDF3( 9217, 15251, 20054) }, + { CDF3( 5078, 9284, 12594) }, + }, + }, { + { + { CDF3( 2331, 3662, 5244) }, + { CDF3( 2891, 4771, 6145) }, + { CDF3( 4598, 7623, 9729) }, + { CDF3( 3520, 6845, 9199) }, + { CDF3( 3417, 6119, 9324) }, + { CDF3( 2601, 5412, 7385) }, + { CDF3( 600, 1173, 1744) }, + { CDF3( 7672, 13286, 17469) }, + { CDF3( 4232, 7792, 10793) }, + { CDF3( 2915, 5317, 7397) }, + { CDF3( 2318, 4356, 6152) }, + { CDF3( 2127, 4000, 5554) }, + { CDF3( 1850, 3478, 5275) }, + { CDF3( 977, 1933, 2843) }, + { CDF3(18280, 24387, 27989) }, + { CDF3(15852, 22671, 26185) }, + { CDF3(13845, 20951, 24789) }, + { CDF3(11055, 17966, 22129) }, + { CDF3( 9138, 15422, 19801) }, + { CDF3( 7454, 13145, 17456) }, + { CDF3( 3370, 6393, 9013) }, + }, { + { CDF3( 5842, 9229, 10838) }, + { CDF3( 2313, 3491, 4276) }, + { CDF3( 2998, 6104, 7496) }, + { CDF3( 2420, 7447, 9868) }, + { CDF3( 3034, 8495, 10923) }, + { CDF3( 4076, 8937, 10975) }, + { CDF3( 1086, 2370, 3299) }, + { CDF3( 9714, 17254, 20444) }, + { CDF3( 8543, 13698, 17123) }, + { CDF3( 4918, 9007, 11910) }, + { CDF3( 4129, 7532, 10553) }, + { CDF3( 2364, 5533, 8058) }, + { CDF3( 1834, 3546, 5563) }, + { CDF3( 1473, 2908, 4133) }, + { CDF3(15405, 21193, 25619) }, + { CDF3(15691, 21952, 26561) }, + { CDF3(12962, 19194, 24165) }, + { CDF3(10272, 17855, 22129) }, + { CDF3( 8588, 15270, 20718) }, + { CDF3( 8682, 14669, 19500) }, + { CDF3( 4870, 9636, 13205) }, + }, + }, + }, + }, [1] = { + .skip = 
{ + { + { CDF1(30371) }, { CDF1( 7570) }, { CDF1(13155) }, + { CDF1(20751) }, { CDF1(20969) }, { CDF1(27067) }, + { CDF1(32013) }, { CDF1( 5495) }, { CDF1(17942) }, + { CDF1(28280) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(31782) }, { CDF1( 1836) }, { CDF1(10689) }, + { CDF1(17604) }, { CDF1(21622) }, { CDF1(27518) }, + { CDF1(32399) }, { CDF1( 4419) }, { CDF1(16294) }, + { CDF1(28345) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(31901) }, { CDF1(10311) }, { CDF1(18047) }, + { CDF1(24806) }, { CDF1(23288) }, { CDF1(27914) }, + { CDF1(32296) }, { CDF1( 4215) }, { CDF1(15756) }, + { CDF1(28341) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(26726) }, { CDF1( 1045) }, { CDF1(11703) }, + { CDF1(20590) }, { CDF1(18554) }, { CDF1(25970) }, + { CDF1(31938) }, { CDF1( 5583) }, { CDF1(21313) }, + { CDF1(29390) }, { CDF1( 641) }, { CDF1(22265) }, + { CDF1(31452) }, + }, { + { CDF1(26584) }, { CDF1( 188) }, { CDF1( 8847) }, + { CDF1(24519) }, { CDF1(22938) }, { CDF1(30583) }, + { CDF1(32608) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, + }, .eob_bin_16 = { + { + { CDF4( 2125, 2551, 5165, 8946) }, + { CDF4( 513, 765, 1859, 6339) }, + }, { + { CDF4( 7637, 9498, 14259, 19108) }, + { CDF4( 2497, 4096, 8866, 16993) }, + }, + }, .eob_bin_32 = { + { + { CDF5( 989, 1249, 2019, 4151, 10785) }, + { CDF5( 313, 441, 1099, 2917, 8562) }, + }, { + { CDF5( 8394, 10352, 13932, 18855, 26014) }, + { CDF5( 2578, 4124, 8181, 13670, 24234) }, + }, + }, .eob_bin_64 = { + { + { CDF6( 1260, 1446, 2253, 3712, 6652, 13369) }, + { CDF6( 401, 605, 1029, 2563, 5845, 12626) }, + }, { + { CDF6( 8609, 10612, 14624, 18714, 22614, 29024) }, + { CDF6( 1923, 3127, 5867, 9703, 14277, 27100) }, + }, + }, .eob_bin_128 = { + { + { CDF7( 685, 933, 1488, 2714, 4766, 8562, 19254) }, + { CDF7( 217, 352, 618, 2303, 5261, 9969, 17472) }, + }, { + { CDF7( 8045, 11200, 
15497, 19595, 23948, 27408, 30938) }, + { CDF7( 2310, 4160, 7471, 14997, 17931, 20768, 30240) }, + }, + }, .eob_bin_256 = { + { + { CDF8( 1448, 2109, 4151, 6263, + 9329, 13260, 17944, 23300) }, + { CDF8( 399, 1019, 1749, 3038, + 10444, 15546, 22739, 27294) }, + }, { + { CDF8( 6402, 8148, 12623, 15072, + 18728, 22847, 26447, 29377) }, + { CDF8( 1674, 3252, 5734, 10159, + 22397, 23802, 24821, 30940) }, + }, + }, .eob_bin_512 = { + { CDF9( 1230, 2278, 5035, 7776, 11871, + 15346, 19590, 24584, 28749) }, + { CDF9( 7265, 9979, 15819, 19250, 21780, + 23846, 26478, 28396, 31811) }, + }, .eob_bin_1024 = { + { CDF10( 696, 948, 3145, 5702, 9706, + 13217, 17851, 21856, 25692, 28034) }, + { CDF10( 2672, 3591, 9330, 17084, 22725, + 24284, 26527, 28027, 28377, 30876) }, + }, .eob_hi_bit = { + { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(17471) }, + { CDF1(20223) }, { CDF1(11357) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20335) }, + { CDF1(21667) }, { CDF1(14818) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20430) }, + { CDF1(20662) }, { CDF1(15367) }, { CDF1(16970) }, + { CDF1(14657) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(22117) }, + { CDF1(22028) }, { CDF1(18650) }, { CDF1(16042) }, + { CDF1(15885) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(22409) }, + { CDF1(21012) }, { CDF1(15650) }, { CDF1(17395) }, + { CDF1(15469) }, { CDF1(20205) }, { CDF1(19511) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(24220) }, + { CDF1(22480) }, { CDF1(17737) }, { CDF1(18916) }, + { CDF1(19268) }, { CDF1(18412) }, { CDF1(18844) }, + { CDF1(16384) 
}, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(25991) }, + { CDF1(20314) }, { CDF1(17731) }, { CDF1(19678) }, + { CDF1(18649) }, { CDF1(17307) }, { CDF1(21798) }, + { CDF1(17549) }, { CDF1(15630) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(26585) }, + { CDF1(21469) }, { CDF1(20432) }, { CDF1(17735) }, + { CDF1(19280) }, { CDF1(15235) }, { CDF1(20297) }, + { CDF1(22471) }, { CDF1(28997) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(26605) }, + { CDF1(11304) }, { CDF1(16726) }, { CDF1(16560) }, + { CDF1(20866) }, { CDF1(23524) }, { CDF1(19878) }, + { CDF1(13469) }, { CDF1(23084) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, + }, .eob_base_tok = { + { + { + { CDF2(17560, 29888) }, { CDF2(29671, 31549) }, + { CDF2(31007, 32056) }, { CDF2(27286, 30006) }, + }, { + { CDF2(26594, 31212) }, { CDF2(31208, 32582) }, + { CDF2(31835, 32637) }, { CDF2(30595, 32206) }, + }, + }, { + { + { CDF2(15239, 29932) }, { CDF2(31315, 32095) }, + { CDF2(32130, 32434) }, { CDF2(30864, 31996) }, + }, { + { CDF2(26279, 30968) }, { CDF2(31142, 32495) }, + { CDF2(31713, 32540) }, { CDF2(31929, 32594) }, + }, + }, { + { + { CDF2( 2644, 25198) }, { CDF2(32038, 32451) }, + { CDF2(32639, 32695) }, { CDF2(32166, 32518) }, + }, { + { CDF2(17187, 27668) }, { CDF2(31714, 32550) }, + { CDF2(32283, 32678) }, { CDF2(31930, 32563) }, + }, + }, { + { + { CDF2( 1044, 2257) }, { CDF2(30755, 31923) }, + { CDF2(32208, 32693) }, { CDF2(32244, 32615) }, + }, { + { CDF2(21317, 26207) }, { CDF2(29133, 30868) }, + { CDF2(29311, 31231) }, { CDF2(29657, 31087) }, + }, + }, { + { + { CDF2( 478, 1834) }, { CDF2(31005, 31987) }, + { CDF2(32317, 32724) }, { CDF2(30865, 32648) }, + }, { + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + }, + }, + }, 
.base_tok = { + { + { + { CDF3( 6041, 11854, 15927) }, + { CDF3(20326, 30905, 32251) }, + { CDF3(14164, 26831, 30725) }, + { CDF3( 9760, 20647, 26585) }, + { CDF3( 6416, 14953, 21219) }, + { CDF3( 2966, 7151, 10891) }, + { CDF3(23567, 31374, 32254) }, + { CDF3(14978, 27416, 30946) }, + { CDF3( 9434, 20225, 26254) }, + { CDF3( 6658, 14558, 20535) }, + { CDF3( 3916, 8677, 12989) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(18088, 29545, 31587) }, + { CDF3(13062, 25843, 30073) }, + { CDF3( 8940, 16827, 22251) }, + { CDF3( 7654, 13220, 17973) }, + { CDF3( 5733, 10316, 14456) }, + { CDF3(22879, 31388, 32114) }, + { CDF3(15215, 27993, 30955) }, + { CDF3( 9397, 19445, 24978) }, + { CDF3( 3442, 9813, 15344) }, + { CDF3( 1368, 3936, 6532) }, + { CDF3(25494, 32033, 32406) }, + { CDF3(16772, 27963, 30718) }, + { CDF3( 9419, 18165, 23260) }, + { CDF3( 2677, 7501, 11797) }, + { CDF3( 1516, 4344, 7170) }, + { CDF3(26556, 31454, 32101) }, + { CDF3(17128, 27035, 30108) }, + { CDF3( 8324, 15344, 20249) }, + { CDF3( 1903, 5696, 9469) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 8455, 19003, 24368) }, + { CDF3(23563, 32021, 32604) }, + { CDF3(16237, 29446, 31935) }, + { CDF3(10724, 23999, 29358) }, + { CDF3( 6725, 17528, 24416) }, + { CDF3( 3927, 10927, 16825) }, + { CDF3(26313, 32288, 32634) }, + { CDF3(17430, 30095, 32095) }, + { CDF3(11116, 24606, 29679) }, + { CDF3( 7195, 18384, 25269) }, + { CDF3( 4726, 12852, 19315) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, 
+ { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(22822, 31648, 32483) }, + { CDF3(16724, 29633, 31929) }, + { CDF3(10261, 23033, 28725) }, + { CDF3( 7029, 17840, 24528) }, + { CDF3( 4867, 13886, 21502) }, + { CDF3(25298, 31892, 32491) }, + { CDF3(17809, 29330, 31512) }, + { CDF3( 9668, 21329, 26579) }, + { CDF3( 4774, 12956, 18976) }, + { CDF3( 2322, 7030, 11540) }, + { CDF3(25472, 31920, 32543) }, + { CDF3(17957, 29387, 31632) }, + { CDF3( 9196, 20593, 26400) }, + { CDF3( 4680, 12705, 19202) }, + { CDF3( 2917, 8456, 13436) }, + { CDF3(26471, 32059, 32574) }, + { CDF3(18458, 29783, 31909) }, + { CDF3( 8400, 19464, 25956) }, + { CDF3( 3812, 10973, 17206) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 6779, 13743, 17678) }, + { CDF3(24806, 31797, 32457) }, + { CDF3(17616, 29047, 31372) }, + { CDF3(11063, 23175, 28003) }, + { CDF3( 6521, 16110, 22324) }, + { CDF3( 2764, 7504, 11654) }, + { CDF3(25266, 32367, 32637) }, + { CDF3(19054, 30553, 32175) }, + { CDF3(12139, 25212, 29807) }, + { CDF3( 7311, 18162, 24704) }, + { CDF3( 3397, 9164, 14074) }, + { CDF3(25988, 32208, 32522) }, + { CDF3(16253, 28912, 31526) }, + { CDF3( 9151, 21387, 27372) }, + { CDF3( 5688, 14915, 21496) }, + { CDF3( 2717, 7627, 12004) }, + { CDF3(23144, 31855, 32443) }, + { CDF3(16070, 28491, 31325) }, + { CDF3( 8702, 20467, 26517) }, + { CDF3( 5243, 13956, 20367) }, + { CDF3( 2621, 7335, 11567) }, + { CDF3(26636, 32340, 32630) }, + { CDF3(19990, 31050, 32341) }, + { CDF3(13243, 26105, 30315) }, + { CDF3( 8588, 19521, 25918) }, + { CDF3( 4717, 11585, 17304) }, + { CDF3(25844, 32292, 32582) }, + { CDF3(19090, 30635, 32097) }, + { CDF3(11963, 24546, 28939) }, + { CDF3( 6218, 16087, 22354) }, + { CDF3( 2340, 6608, 10426) }, + { CDF3(28046, 32576, 32694) }, + { CDF3(21178, 31313, 32296) }, + { CDF3(13486, 26184, 29870) }, + { CDF3( 7149, 17871, 23723) }, + { CDF3( 2833, 7958, 12259) }, + { CDF3(27710, 32528, 32686) }, + { CDF3(20674, 31076, 32268) }, + { 
CDF3(12413, 24955, 29243) }, + { CDF3( 6676, 16927, 23097) }, + { CDF3( 2966, 8333, 12919) }, + }, { + { CDF3( 8639, 19339, 24429) }, + { CDF3(24404, 31837, 32525) }, + { CDF3(16997, 29425, 31784) }, + { CDF3(11253, 24234, 29149) }, + { CDF3( 6751, 17394, 24028) }, + { CDF3( 3490, 9830, 15191) }, + { CDF3(26283, 32471, 32714) }, + { CDF3(19599, 31168, 32442) }, + { CDF3(13146, 26954, 30893) }, + { CDF3( 8214, 20588, 26890) }, + { CDF3( 4699, 13081, 19300) }, + { CDF3(28212, 32458, 32669) }, + { CDF3(18594, 30316, 32100) }, + { CDF3(11219, 24408, 29234) }, + { CDF3( 6865, 17656, 24149) }, + { CDF3( 3678, 10362, 16006) }, + { CDF3(25825, 32136, 32616) }, + { CDF3(17313, 29853, 32021) }, + { CDF3(11197, 24471, 29472) }, + { CDF3( 6947, 17781, 24405) }, + { CDF3( 3768, 10660, 16261) }, + { CDF3(27352, 32500, 32706) }, + { CDF3(20850, 31468, 32469) }, + { CDF3(14021, 27707, 31133) }, + { CDF3( 8964, 21748, 27838) }, + { CDF3( 5437, 14665, 21187) }, + { CDF3(26304, 32492, 32698) }, + { CDF3(20409, 31380, 32385) }, + { CDF3(13682, 27222, 30632) }, + { CDF3( 8974, 21236, 26685) }, + { CDF3( 4234, 11665, 16934) }, + { CDF3(26273, 32357, 32711) }, + { CDF3(20672, 31242, 32441) }, + { CDF3(14172, 27254, 30902) }, + { CDF3( 9870, 21898, 27275) }, + { CDF3( 5164, 13506, 19270) }, + { CDF3(26725, 32459, 32728) }, + { CDF3(20991, 31442, 32527) }, + { CDF3(13071, 26434, 30811) }, + { CDF3( 8184, 20090, 26742) }, + { CDF3( 4803, 13255, 19895) }, + }, + }, { + { + { CDF3( 7555, 14942, 18501) }, + { CDF3(24410, 31178, 32287) }, + { CDF3(14394, 26738, 30253) }, + { CDF3( 8413, 19554, 25195) }, + { CDF3( 4766, 12924, 18785) }, + { CDF3( 2029, 5806, 9207) }, + { CDF3(26776, 32364, 32663) }, + { CDF3(18732, 29967, 31931) }, + { CDF3(11005, 23786, 28852) }, + { CDF3( 6466, 16909, 23510) }, + { CDF3( 3044, 8638, 13419) }, + { CDF3(29208, 32582, 32704) }, + { CDF3(20068, 30857, 32208) }, + { CDF3(12003, 25085, 29595) }, + { CDF3( 6947, 17750, 24189) }, + { CDF3( 3245, 9103, 14007) }, + { 
CDF3(27359, 32465, 32669) }, + { CDF3(19421, 30614, 32174) }, + { CDF3(11915, 25010, 29579) }, + { CDF3( 6950, 17676, 24074) }, + { CDF3( 3007, 8473, 13096) }, + { CDF3(29002, 32676, 32735) }, + { CDF3(22102, 31849, 32576) }, + { CDF3(14408, 28009, 31405) }, + { CDF3( 9027, 21679, 27931) }, + { CDF3( 4694, 12678, 18748) }, + { CDF3(28216, 32528, 32682) }, + { CDF3(20849, 31264, 32318) }, + { CDF3(12756, 25815, 29751) }, + { CDF3( 7565, 18801, 24923) }, + { CDF3( 3509, 9533, 14477) }, + { CDF3(30133, 32687, 32739) }, + { CDF3(23063, 31910, 32515) }, + { CDF3(14588, 28051, 31132) }, + { CDF3( 9085, 21649, 27457) }, + { CDF3( 4261, 11654, 17264) }, + { CDF3(29518, 32691, 32748) }, + { CDF3(22451, 31959, 32613) }, + { CDF3(14864, 28722, 31700) }, + { CDF3( 9695, 22964, 28716) }, + { CDF3( 4932, 13358, 19502) }, + }, { + { CDF3( 6465, 16958, 21688) }, + { CDF3(25199, 31514, 32360) }, + { CDF3(14774, 27149, 30607) }, + { CDF3( 9257, 21438, 26972) }, + { CDF3( 5723, 15183, 21882) }, + { CDF3( 3150, 8879, 13731) }, + { CDF3(26989, 32262, 32682) }, + { CDF3(17396, 29937, 32085) }, + { CDF3(11387, 24901, 29784) }, + { CDF3( 7289, 18821, 25548) }, + { CDF3( 3734, 10577, 16086) }, + { CDF3(29728, 32501, 32695) }, + { CDF3(17431, 29701, 31903) }, + { CDF3( 9921, 22826, 28300) }, + { CDF3( 5896, 15434, 22068) }, + { CDF3( 3430, 9646, 14757) }, + { CDF3(28614, 32511, 32705) }, + { CDF3(19364, 30638, 32263) }, + { CDF3(13129, 26254, 30402) }, + { CDF3( 8754, 20484, 26440) }, + { CDF3( 4378, 11607, 17110) }, + { CDF3(30292, 32671, 32744) }, + { CDF3(21780, 31603, 32501) }, + { CDF3(14314, 27829, 31291) }, + { CDF3( 9611, 22327, 28263) }, + { CDF3( 4890, 13087, 19065) }, + { CDF3(25862, 32567, 32733) }, + { CDF3(20794, 32050, 32567) }, + { CDF3(17243, 30625, 32254) }, + { CDF3(13283, 27628, 31474) }, + { CDF3( 9669, 22532, 28918) }, + { CDF3(27435, 32697, 32748) }, + { CDF3(24922, 32390, 32714) }, + { CDF3(21449, 31504, 32536) }, + { CDF3(16392, 29729, 31832) }, + { CDF3(11692, 
24884, 29076) }, + { CDF3(24193, 32290, 32735) }, + { CDF3(18909, 31104, 32563) }, + { CDF3(12236, 26841, 31403) }, + { CDF3( 8171, 21840, 29082) }, + { CDF3( 7224, 17280, 25275) }, + }, + }, { + { + { CDF3( 3078, 6839, 9890) }, + { CDF3(13837, 20450, 24479) }, + { CDF3( 5914, 14222, 19328) }, + { CDF3( 3866, 10267, 14762) }, + { CDF3( 2612, 7208, 11042) }, + { CDF3( 1067, 2991, 4776) }, + { CDF3(25817, 31646, 32529) }, + { CDF3(13708, 26338, 30385) }, + { CDF3( 7328, 18585, 24870) }, + { CDF3( 4691, 13080, 19276) }, + { CDF3( 1825, 5253, 8352) }, + { CDF3(29386, 32315, 32624) }, + { CDF3(17160, 29001, 31360) }, + { CDF3( 9602, 21862, 27396) }, + { CDF3( 5915, 15772, 22148) }, + { CDF3( 2786, 7779, 12047) }, + { CDF3(29246, 32450, 32663) }, + { CDF3(18696, 29929, 31818) }, + { CDF3(10510, 23369, 28560) }, + { CDF3( 6229, 16499, 23125) }, + { CDF3( 2608, 7448, 11705) }, + { CDF3(30753, 32710, 32748) }, + { CDF3(21638, 31487, 32503) }, + { CDF3(12937, 26854, 30870) }, + { CDF3( 8182, 20596, 26970) }, + { CDF3( 3637, 10269, 15497) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 5244, 12150, 16906) }, + { CDF3(20486, 26858, 29701) }, + { CDF3( 7756, 18317, 23735) }, + { CDF3( 3452, 9256, 13146) }, + { CDF3( 2020, 5206, 8229) }, + { CDF3( 1801, 4993, 7903) }, + { CDF3(27051, 31858, 32531) }, + { CDF3(15988, 27531, 30619) }, + { CDF3( 9188, 21484, 26719) }, + { CDF3( 6273, 17186, 23800) }, + { CDF3( 3108, 9355, 14764) }, + { CDF3(31076, 32520, 32680) }, + { CDF3(18119, 30037, 31850) }, + { CDF3(10244, 22969, 
27472) }, + { CDF3( 4692, 14077, 19273) }, + { CDF3( 3694, 11677, 17556) }, + { CDF3(30060, 32581, 32720) }, + { CDF3(21011, 30775, 32120) }, + { CDF3(11931, 24820, 29289) }, + { CDF3( 7119, 17662, 24356) }, + { CDF3( 3833, 10706, 16304) }, + { CDF3(31954, 32731, 32748) }, + { CDF3(23913, 31724, 32489) }, + { CDF3(15520, 28060, 31286) }, + { CDF3(11517, 23008, 28571) }, + { CDF3( 6193, 14508, 20629) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 1035, 2807, 4156) }, + { CDF3(13162, 18138, 20939) }, + { CDF3( 2696, 6633, 8755) }, + { CDF3( 1373, 4161, 6853) }, + { CDF3( 1099, 2746, 4716) }, + { CDF3( 340, 1021, 1599) }, + { CDF3(22826, 30419, 32135) }, + { CDF3(10395, 21762, 26942) }, + { CDF3( 4726, 12407, 17361) }, + { CDF3( 2447, 7080, 10593) }, + { CDF3( 1227, 3717, 6011) }, + { CDF3(28156, 31424, 31934) }, + { CDF3(16915, 27754, 30373) }, + { CDF3( 9148, 20990, 26431) }, + { CDF3( 5950, 15515, 21148) }, + { CDF3( 2492, 7327, 11526) }, + { CDF3(30602, 32477, 32670) }, + { CDF3(20026, 29955, 31568) }, + { CDF3(11220, 23628, 28105) }, + { CDF3( 6652, 17019, 22973) }, + { CDF3( 3064, 8536, 13043) }, + { CDF3(31769, 32724, 32748) }, + { CDF3(22230, 30887, 32373) }, + { CDF3(12234, 25079, 29731) }, + { CDF3( 7326, 18816, 25353) }, + { CDF3( 3933, 10907, 16616) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 
8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, + }, .dc_sign = { + { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } }, + { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } }, + }, .br_tok = { + { + { + { CDF3(14995, 21341, 24749) }, + { CDF3(13158, 20289, 24601) }, + { CDF3( 8941, 15326, 19876) }, + { CDF3( 6297, 11541, 15807) }, + { CDF3( 4817, 9029, 12776) }, + { CDF3( 3731, 7273, 10627) }, + { CDF3( 1847, 3617, 
5354) }, + { CDF3(14472, 19659, 22343) }, + { CDF3(16806, 24162, 27533) }, + { CDF3(12900, 20404, 24713) }, + { CDF3( 9411, 16112, 20797) }, + { CDF3( 7056, 12697, 17148) }, + { CDF3( 5544, 10339, 14460) }, + { CDF3( 2954, 5704, 8319) }, + { CDF3(12464, 18071, 21354) }, + { CDF3(15482, 22528, 26034) }, + { CDF3(12070, 19269, 23624) }, + { CDF3( 8953, 15406, 20106) }, + { CDF3( 7027, 12730, 17220) }, + { CDF3( 5887, 10913, 15140) }, + { CDF3( 3793, 7278, 10447) }, + }, { + { CDF3(15571, 22232, 25749) }, + { CDF3(14506, 21575, 25374) }, + { CDF3(10189, 17089, 21569) }, + { CDF3( 7316, 13301, 17915) }, + { CDF3( 5783, 10912, 15190) }, + { CDF3( 4760, 9155, 13088) }, + { CDF3( 2993, 5966, 8774) }, + { CDF3(23424, 28903, 30778) }, + { CDF3(20775, 27666, 30290) }, + { CDF3(16474, 24410, 28299) }, + { CDF3(12471, 20180, 24987) }, + { CDF3( 9410, 16487, 21439) }, + { CDF3( 7536, 13614, 18529) }, + { CDF3( 5048, 9586, 13549) }, + { CDF3(21090, 27290, 29756) }, + { CDF3(20796, 27402, 30026) }, + { CDF3(17819, 25485, 28969) }, + { CDF3(13860, 21909, 26462) }, + { CDF3(11002, 18494, 23529) }, + { CDF3( 8953, 15929, 20897) }, + { CDF3( 6448, 11918, 16454) }, + }, + }, { + { + { CDF3(15999, 22208, 25449) }, + { CDF3(13050, 19988, 24122) }, + { CDF3( 8594, 14864, 19378) }, + { CDF3( 6033, 11079, 15238) }, + { CDF3( 4554, 8683, 12347) }, + { CDF3( 3672, 7139, 10337) }, + { CDF3( 1900, 3771, 5576) }, + { CDF3(15788, 21340, 23949) }, + { CDF3(16825, 24235, 27758) }, + { CDF3(12873, 20402, 24810) }, + { CDF3( 9590, 16363, 21094) }, + { CDF3( 7352, 13209, 17733) }, + { CDF3( 5960, 10989, 15184) }, + { CDF3( 3232, 6234, 9007) }, + { CDF3(15761, 20716, 23224) }, + { CDF3(19318, 25989, 28759) }, + { CDF3(15529, 23094, 26929) }, + { CDF3(11662, 18989, 23641) }, + { CDF3( 8955, 15568, 20366) }, + { CDF3( 7281, 13106, 17708) }, + { CDF3( 4248, 8059, 11440) }, + }, { + { CDF3(14899, 21217, 24503) }, + { CDF3(13519, 20283, 24047) }, + { CDF3( 9429, 15966, 20365) }, + { CDF3( 6700, 12355, 
16652) }, + { CDF3( 5088, 9704, 13716) }, + { CDF3( 4243, 8154, 11731) }, + { CDF3( 2702, 5364, 7861) }, + { CDF3(22745, 28388, 30454) }, + { CDF3(20235, 27146, 29922) }, + { CDF3(15896, 23715, 27637) }, + { CDF3(11840, 19350, 24131) }, + { CDF3( 9122, 15932, 20880) }, + { CDF3( 7488, 13581, 18362) }, + { CDF3( 5114, 9568, 13370) }, + { CDF3(20845, 26553, 28932) }, + { CDF3(20981, 27372, 29884) }, + { CDF3(17781, 25335, 28785) }, + { CDF3(13760, 21708, 26297) }, + { CDF3(10975, 18415, 23365) }, + { CDF3( 9045, 15789, 20686) }, + { CDF3( 6130, 11199, 15423) }, + }, + }, { + { + { CDF3(13549, 19724, 23158) }, + { CDF3(11844, 18382, 22246) }, + { CDF3( 7919, 13619, 17773) }, + { CDF3( 5486, 10143, 13946) }, + { CDF3( 4166, 7983, 11324) }, + { CDF3( 3364, 6506, 9427) }, + { CDF3( 1598, 3160, 4674) }, + { CDF3(15281, 20979, 23781) }, + { CDF3(14939, 22119, 25952) }, + { CDF3(11363, 18407, 22812) }, + { CDF3( 8609, 14857, 19370) }, + { CDF3( 6737, 12184, 16480) }, + { CDF3( 5506, 10263, 14262) }, + { CDF3( 2990, 5786, 8380) }, + { CDF3(20249, 25253, 27417) }, + { CDF3(21070, 27518, 30001) }, + { CDF3(16854, 24469, 28074) }, + { CDF3(12864, 20486, 25000) }, + { CDF3( 9962, 16978, 21778) }, + { CDF3( 8074, 14338, 19048) }, + { CDF3( 4494, 8479, 11906) }, + }, { + { CDF3(13960, 19617, 22829) }, + { CDF3(11150, 17341, 21228) }, + { CDF3( 7150, 12964, 17190) }, + { CDF3( 5331, 10002, 13867) }, + { CDF3( 4167, 7744, 11057) }, + { CDF3( 3480, 6629, 9646) }, + { CDF3( 1883, 3784, 5686) }, + { CDF3(18752, 25660, 28912) }, + { CDF3(16968, 24586, 28030) }, + { CDF3(13520, 21055, 25313) }, + { CDF3(10453, 17626, 22280) }, + { CDF3( 8386, 14505, 19116) }, + { CDF3( 6742, 12595, 17008) }, + { CDF3( 4273, 8140, 11499) }, + { CDF3(22120, 27827, 30233) }, + { CDF3(20563, 27358, 29895) }, + { CDF3(17076, 24644, 28153) }, + { CDF3(13362, 20942, 25309) }, + { CDF3(10794, 17965, 22695) }, + { CDF3( 9014, 15652, 20319) }, + { CDF3( 5708, 10512, 14497) }, + }, + }, { + { + { CDF3( 5705, 10930, 
15725) }, + { CDF3( 7946, 12765, 16115) }, + { CDF3( 6801, 12123, 16226) }, + { CDF3( 5462, 10135, 14200) }, + { CDF3( 4189, 8011, 11507) }, + { CDF3( 3191, 6229, 9408) }, + { CDF3( 1057, 2137, 3212) }, + { CDF3(10018, 17067, 21491) }, + { CDF3( 7380, 12582, 16453) }, + { CDF3( 6068, 10845, 14339) }, + { CDF3( 5098, 9198, 12555) }, + { CDF3( 4312, 8010, 11119) }, + { CDF3( 3700, 6966, 9781) }, + { CDF3( 1693, 3326, 4887) }, + { CDF3(18757, 24930, 27774) }, + { CDF3(17648, 24596, 27817) }, + { CDF3(14707, 22052, 26026) }, + { CDF3(11720, 18852, 23292) }, + { CDF3( 9357, 15952, 20525) }, + { CDF3( 7810, 13753, 18210) }, + { CDF3( 3879, 7333, 10328) }, + }, { + { CDF3( 8278, 13242, 15922) }, + { CDF3(10547, 15867, 18919) }, + { CDF3( 9106, 15842, 20609) }, + { CDF3( 6833, 13007, 17218) }, + { CDF3( 4811, 9712, 13923) }, + { CDF3( 3985, 7352, 11128) }, + { CDF3( 1688, 3458, 5262) }, + { CDF3(12951, 21861, 26510) }, + { CDF3( 9788, 16044, 20276) }, + { CDF3( 6309, 11244, 14870) }, + { CDF3( 5183, 9349, 12566) }, + { CDF3( 4389, 8229, 11492) }, + { CDF3( 3633, 6945, 10620) }, + { CDF3( 3600, 6847, 9907) }, + { CDF3(21748, 28137, 30255) }, + { CDF3(19436, 26581, 29560) }, + { CDF3(16359, 24201, 27953) }, + { CDF3(13961, 21693, 25871) }, + { CDF3(11544, 18686, 23322) }, + { CDF3( 9372, 16462, 20952) }, + { CDF3( 6138, 11210, 15390) }, + }, + }, + }, + }, [2] = { + .skip = { + { + { CDF1(29614) }, { CDF1( 9068) }, { CDF1(12924) }, + { CDF1(19538) }, { CDF1(17737) }, { CDF1(24619) }, + { CDF1(30642) }, { CDF1( 4119) }, { CDF1(16026) }, + { CDF1(25657) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(31957) }, { CDF1( 3230) }, { CDF1(11153) }, + { CDF1(18123) }, { CDF1(20143) }, { CDF1(26536) }, + { CDF1(31986) }, { CDF1( 3050) }, { CDF1(14603) }, + { CDF1(25155) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(32363) }, { CDF1(10692) }, { CDF1(19090) }, + { CDF1(24357) }, { CDF1(24442) }, { CDF1(28312) }, + { CDF1(32169) }, { 
CDF1( 3648) }, { CDF1(15690) }, + { CDF1(26815) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(30669) }, { CDF1( 3832) }, { CDF1(11663) }, + { CDF1(18889) }, { CDF1(19782) }, { CDF1(23313) }, + { CDF1(31330) }, { CDF1( 5124) }, { CDF1(18719) }, + { CDF1(28468) }, { CDF1( 3082) }, { CDF1(20982) }, + { CDF1(29443) }, + }, { + { CDF1(28573) }, { CDF1( 3183) }, { CDF1(17802) }, + { CDF1(25977) }, { CDF1(26677) }, { CDF1(27832) }, + { CDF1(32387) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, + }, .eob_bin_16 = { + { + { CDF4( 4016, 4897, 8881, 14968) }, + { CDF4( 716, 1105, 2646, 10056) }, + }, { + { CDF4(11139, 13270, 18241, 23566) }, + { CDF4( 3192, 5032, 10297, 19755) }, + }, + }, .eob_bin_32 = { + { + { CDF5( 2515, 3003, 4452, 8162, 16041) }, + { CDF5( 574, 821, 1836, 5089, 13128) }, + }, { + { CDF5(13468, 16303, 20361, 25105, 29281) }, + { CDF5( 3542, 5502, 10415, 16760, 25644) }, + }, + }, .eob_bin_64 = { + { + { CDF6( 2374, 2772, 4583, 7276, 12288, 19706) }, + { CDF6( 497, 810, 1315, 3000, 7004, 15641) }, + }, { + { CDF6(15050, 17126, 21410, 24886, 28156, 30726) }, + { CDF6( 4034, 6290, 10235, 14982, 21214, 28491) }, + }, + }, .eob_bin_128 = { + { + { CDF7( 1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { CDF7( 354, 558, 944, 2760, 7287, 14037, 21779) }, + }, { + { CDF7(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { CDF7( 6275, 9889, 14769, 23164, 27988, 30493, 32272) }, + }, + }, .eob_bin_256 = { + { + { CDF8( 3089, 3920, 6038, 9460, + 14266, 19881, 25766, 29176) }, + { CDF8( 1084, 2358, 3488, 5122, + 11483, 18103, 26023, 29799) }, + }, { + { CDF8(11514, 13794, 17480, 20754, + 24361, 27378, 29492, 31277) }, + { CDF8( 6571, 9610, 15516, 21826, + 29092, 30829, 31842, 32708) }, + }, + }, .eob_bin_512 = { + { CDF9( 2624, 3936, 6480, 9686, 13979, + 17726, 23267, 28410, 31078) }, + { CDF9(12015, 14769, 19588, 22052, 24222, + 25812, 27300, 29219, 32114) }, + }, 
.eob_bin_1024 = { + { CDF10( 2784, 3831, 7041, 10521, 14847, + 18844, 23155, 26682, 29229, 31045) }, + { CDF10( 9577, 12466, 17739, 20750, 22061, + 23215, 24601, 25483, 25843, 32056) }, + }, .eob_hi_bit = { + { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(18983) }, + { CDF1(20512) }, { CDF1(14885) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20090) }, + { CDF1(19444) }, { CDF1(17286) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(19139) }, + { CDF1(21487) }, { CDF1(18959) }, { CDF1(20910) }, + { CDF1(19089) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20536) }, + { CDF1(20664) }, { CDF1(20625) }, { CDF1(19123) }, + { CDF1(14862) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(19833) }, + { CDF1(21502) }, { CDF1(17485) }, { CDF1(20267) }, + { CDF1(18353) }, { CDF1(23329) }, { CDF1(21478) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(22041) }, + { CDF1(23434) }, { CDF1(20001) }, { CDF1(20554) }, + { CDF1(20951) }, { CDF1(20145) }, { CDF1(15562) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(23312) }, + { CDF1(21607) }, { CDF1(16526) }, { CDF1(18957) }, + { CDF1(18034) }, { CDF1(18934) }, { CDF1(24247) }, + { CDF1(16921) }, { CDF1(17080) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(26579) }, + { CDF1(24910) }, { CDF1(18637) }, { CDF1(19800) }, + { CDF1(20388) }, { CDF1( 9887) }, { CDF1(15642) }, + { CDF1(30198) }, { CDF1(24721) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(26998) }, + { CDF1(16737) }, { CDF1(17838) }, { CDF1(18922) }, + { CDF1(19515) }, { 
CDF1(18636) }, { CDF1(17333) }, + { CDF1(15776) }, { CDF1(22658) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, + }, .eob_base_tok = { + { + { + { CDF2(20092, 30774) }, { CDF2(30695, 32020) }, + { CDF2(31131, 32103) }, { CDF2(28666, 30870) }, + }, { + { CDF2(27258, 31095) }, { CDF2(31804, 32623) }, + { CDF2(31763, 32528) }, { CDF2(31438, 32506) }, + }, + }, { + { + { CDF2(18049, 30489) }, { CDF2(31706, 32286) }, + { CDF2(32163, 32473) }, { CDF2(31550, 32184) }, + }, { + { CDF2(27116, 30842) }, { CDF2(31971, 32598) }, + { CDF2(32088, 32576) }, { CDF2(32067, 32664) }, + }, + }, { + { + { CDF2(12854, 29093) }, { CDF2(32272, 32558) }, + { CDF2(32667, 32729) }, { CDF2(32306, 32585) }, + }, { + { CDF2(25476, 30366) }, { CDF2(32169, 32687) }, + { CDF2(32479, 32689) }, { CDF2(31673, 32634) }, + }, + }, { + { + { CDF2( 2809, 19301) }, { CDF2(32205, 32622) }, + { CDF2(32338, 32730) }, { CDF2(31786, 32616) }, + }, { + { CDF2(22737, 29105) }, { CDF2(30810, 32362) }, + { CDF2(30014, 32627) }, { CDF2(30528, 32574) }, + }, + }, { + { + { CDF2( 935, 3382) }, { CDF2(30789, 31909) }, + { CDF2(32466, 32756) }, { CDF2(30860, 32513) }, + }, { + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + }, + }, + }, .base_tok = { + { + { + { CDF3( 8896, 16227, 20630) }, + { CDF3(23629, 31782, 32527) }, + { CDF3(15173, 27755, 31321) }, + { CDF3(10158, 21233, 27382) }, + { CDF3( 6420, 14857, 21558) }, + { CDF3( 3269, 8155, 12646) }, + { CDF3(24835, 32009, 32496) }, + { CDF3(16509, 28421, 31579) }, + { CDF3(10957, 21514, 27418) }, + { CDF3( 7881, 15930, 22096) }, + { CDF3( 5388, 10960, 15918) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 
24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(20745, 30773, 32093) }, + { CDF3(15200, 27221, 30861) }, + { CDF3(13032, 20873, 25667) }, + { CDF3(12285, 18663, 23494) }, + { CDF3(11563, 17481, 21489) }, + { CDF3(26260, 31982, 32320) }, + { CDF3(15397, 28083, 31100) }, + { CDF3( 9742, 19217, 24824) }, + { CDF3( 3261, 9629, 15362) }, + { CDF3( 1480, 4322, 7499) }, + { CDF3(27599, 32256, 32460) }, + { CDF3(16857, 27659, 30774) }, + { CDF3( 9551, 18290, 23748) }, + { CDF3( 3052, 8933, 14103) }, + { CDF3( 2021, 5910, 9787) }, + { CDF3(29005, 32015, 32392) }, + { CDF3(17677, 27694, 30863) }, + { CDF3( 9204, 17356, 23219) }, + { CDF3( 2403, 7516, 12814) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3(10808, 22056, 26896) }, + { CDF3(25739, 32313, 32676) }, + { CDF3(17288, 30203, 32221) }, + { CDF3(11359, 24878, 29896) }, + { CDF3( 6949, 17767, 24893) }, + { CDF3( 4287, 11796, 18071) }, + { CDF3(27880, 32521, 32705) }, + { CDF3(19038, 31004, 32414) }, + { CDF3(12564, 26345, 30768) }, + { CDF3( 8269, 19947, 26779) }, + { CDF3( 5674, 14657, 21674) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(25742, 32319, 32671) }, + { CDF3(19557, 31164, 32454) }, + { CDF3(13381, 26381, 30755) }, + { CDF3(10101, 21466, 26722) }, + { CDF3( 9209, 19650, 26825) }, + { CDF3(27107, 31917, 32432) }, + { CDF3(18056, 28893, 31203) }, + { CDF3(10200, 21434, 26764) }, + { CDF3( 4660, 12913, 19502) }, + { CDF3( 2368, 6930, 12504) }, + { CDF3(26960, 32158, 32613) }, + { CDF3(18628, 30005, 32031) }, + { CDF3(10233, 22442, 28232) }, + { CDF3( 5471, 14630, 21516) }, + { CDF3( 3235, 10767, 17109) }, + { 
CDF3(27696, 32440, 32692) }, + { CDF3(20032, 31167, 32438) }, + { CDF3( 8700, 21341, 28442) }, + { CDF3( 5662, 14831, 21795) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 9704, 17294, 21132) }, + { CDF3(26762, 32278, 32633) }, + { CDF3(18382, 29620, 31819) }, + { CDF3(10891, 23475, 28723) }, + { CDF3( 6358, 16583, 23309) }, + { CDF3( 3248, 9118, 14141) }, + { CDF3(27204, 32573, 32699) }, + { CDF3(19818, 30824, 32329) }, + { CDF3(11772, 25120, 30041) }, + { CDF3( 6995, 18033, 25039) }, + { CDF3( 3752, 10442, 16098) }, + { CDF3(27222, 32256, 32559) }, + { CDF3(15356, 28399, 31475) }, + { CDF3( 8821, 20635, 27057) }, + { CDF3( 5511, 14404, 21239) }, + { CDF3( 2935, 8222, 13051) }, + { CDF3(24875, 32120, 32529) }, + { CDF3(15233, 28265, 31445) }, + { CDF3( 8605, 20570, 26932) }, + { CDF3( 5431, 14413, 21196) }, + { CDF3( 2994, 8341, 13223) }, + { CDF3(28201, 32604, 32700) }, + { CDF3(21041, 31446, 32456) }, + { CDF3(13221, 26213, 30475) }, + { CDF3( 8255, 19385, 26037) }, + { CDF3( 4930, 12585, 18830) }, + { CDF3(28768, 32448, 32627) }, + { CDF3(19705, 30561, 32021) }, + { CDF3(11572, 23589, 28220) }, + { CDF3( 5532, 15034, 21446) }, + { CDF3( 2460, 7150, 11456) }, + { CDF3(29874, 32619, 32699) }, + { CDF3(21621, 31071, 32201) }, + { CDF3(12511, 24747, 28992) }, + { CDF3( 6281, 16395, 22748) }, + { CDF3( 3246, 9278, 14497) }, + { CDF3(29715, 32625, 32712) }, + { CDF3(20958, 31011, 32283) }, + { CDF3(11233, 23671, 28806) }, + { CDF3( 6012, 16128, 22868) }, + { CDF3( 3427, 9851, 15414) }, + }, { + { CDF3(11016, 22111, 26794) }, + { CDF3(25946, 32357, 32677) }, + { CDF3(17890, 30452, 32252) }, + { CDF3(11678, 25142, 29816) }, + { CDF3( 6720, 17534, 24584) }, + { CDF3( 4230, 11665, 17820) }, + { CDF3(28400, 32623, 32747) }, + { CDF3(21164, 31668, 32575) }, + { CDF3(13572, 27388, 31182) }, + { CDF3( 8234, 20750, 27358) }, + { CDF3( 5065, 14055, 20897) }, + { CDF3(28981, 32547, 32705) }, + { CDF3(18681, 30543, 32239) }, + { CDF3(10919, 24075, 29286) }, + { 
CDF3( 6431, 17199, 24077) }, + { CDF3( 3819, 10464, 16618) }, + { CDF3(26870, 32467, 32693) }, + { CDF3(19041, 30831, 32347) }, + { CDF3(11794, 25211, 30016) }, + { CDF3( 6888, 18019, 24970) }, + { CDF3( 4370, 12363, 18992) }, + { CDF3(29578, 32670, 32744) }, + { CDF3(23159, 32007, 32613) }, + { CDF3(15315, 28669, 31676) }, + { CDF3( 9298, 22607, 28782) }, + { CDF3( 6144, 15913, 22968) }, + { CDF3(28110, 32499, 32669) }, + { CDF3(21574, 30937, 32015) }, + { CDF3(12759, 24818, 28727) }, + { CDF3( 6545, 16761, 23042) }, + { CDF3( 3649, 10597, 16833) }, + { CDF3(28163, 32552, 32728) }, + { CDF3(22101, 31469, 32464) }, + { CDF3(13160, 25472, 30143) }, + { CDF3( 7303, 18684, 25468) }, + { CDF3( 5241, 13975, 20955) }, + { CDF3(28400, 32631, 32744) }, + { CDF3(22104, 31793, 32603) }, + { CDF3(13557, 26571, 30846) }, + { CDF3( 7749, 19861, 26675) }, + { CDF3( 4873, 14030, 21234) }, + }, + }, { + { + { CDF3( 9800, 17635, 21073) }, + { CDF3(26153, 31885, 32527) }, + { CDF3(15038, 27852, 31006) }, + { CDF3( 8718, 20564, 26486) }, + { CDF3( 5128, 14076, 20514) }, + { CDF3( 2636, 7566, 11925) }, + { CDF3(27551, 32504, 32701) }, + { CDF3(18310, 30054, 32100) }, + { CDF3(10211, 23420, 29082) }, + { CDF3( 6222, 16876, 23916) }, + { CDF3( 3462, 9954, 15498) }, + { CDF3(29991, 32633, 32721) }, + { CDF3(19883, 30751, 32201) }, + { CDF3(11141, 24184, 29285) }, + { CDF3( 6420, 16940, 23774) }, + { CDF3( 3392, 9753, 15118) }, + { CDF3(28465, 32616, 32712) }, + { CDF3(19850, 30702, 32244) }, + { CDF3(10983, 24024, 29223) }, + { CDF3( 6294, 16770, 23582) }, + { CDF3( 3244, 9283, 14509) }, + { CDF3(30023, 32717, 32748) }, + { CDF3(22940, 32032, 32626) }, + { CDF3(14282, 27928, 31473) }, + { CDF3( 8562, 21327, 27914) }, + { CDF3( 4846, 13393, 19919) }, + { CDF3(29981, 32590, 32695) }, + { CDF3(20465, 30963, 32166) }, + { CDF3(11479, 23579, 28195) }, + { CDF3( 5916, 15648, 22073) }, + { CDF3( 3031, 8605, 13398) }, + { CDF3(31146, 32691, 32739) }, + { CDF3(23106, 31724, 32444) }, + { 
CDF3(13783, 26738, 30439) }, + { CDF3( 7852, 19468, 25807) }, + { CDF3( 3860, 11124, 16853) }, + { CDF3(31014, 32724, 32748) }, + { CDF3(23629, 32109, 32628) }, + { CDF3(14747, 28115, 31403) }, + { CDF3( 8545, 21242, 27478) }, + { CDF3( 4574, 12781, 19067) }, + }, { + { CDF3( 9185, 19694, 24688) }, + { CDF3(26081, 31985, 32621) }, + { CDF3(16015, 29000, 31787) }, + { CDF3(10542, 23690, 29206) }, + { CDF3( 6732, 17945, 24677) }, + { CDF3( 3916, 11039, 16722) }, + { CDF3(28224, 32566, 32744) }, + { CDF3(19100, 31138, 32485) }, + { CDF3(12528, 26620, 30879) }, + { CDF3( 7741, 20277, 26885) }, + { CDF3( 4566, 12845, 18990) }, + { CDF3(29933, 32593, 32718) }, + { CDF3(17670, 30333, 32155) }, + { CDF3(10385, 23600, 28909) }, + { CDF3( 6243, 16236, 22407) }, + { CDF3( 3976, 10389, 16017) }, + { CDF3(28377, 32561, 32738) }, + { CDF3(19366, 31175, 32482) }, + { CDF3(13327, 27175, 31094) }, + { CDF3( 8258, 20769, 27143) }, + { CDF3( 4703, 13198, 19527) }, + { CDF3(31086, 32706, 32748) }, + { CDF3(22853, 31902, 32583) }, + { CDF3(14759, 28186, 31419) }, + { CDF3( 9284, 22382, 28348) }, + { CDF3( 5585, 15192, 21868) }, + { CDF3(28291, 32652, 32746) }, + { CDF3(19849, 32107, 32571) }, + { CDF3(14834, 26818, 29214) }, + { CDF3(10306, 22594, 28672) }, + { CDF3( 6615, 17384, 23384) }, + { CDF3(28947, 32604, 32745) }, + { CDF3(25625, 32289, 32646) }, + { CDF3(18758, 28672, 31403) }, + { CDF3(10017, 23430, 28523) }, + { CDF3( 6862, 15269, 22131) }, + { CDF3(23933, 32509, 32739) }, + { CDF3(19927, 31495, 32631) }, + { CDF3(11903, 26023, 30621) }, + { CDF3( 7026, 20094, 27252) }, + { CDF3( 5998, 18106, 24437) }, + }, + }, { + { + { CDF3( 4456, 11274, 15533) }, + { CDF3(21219, 29079, 31616) }, + { CDF3(11173, 23774, 28567) }, + { CDF3( 7282, 18293, 24263) }, + { CDF3( 4890, 13286, 19115) }, + { CDF3( 1890, 5508, 8659) }, + { CDF3(26651, 32136, 32647) }, + { CDF3(14630, 28254, 31455) }, + { CDF3( 8716, 21287, 27395) }, + { CDF3( 5615, 15331, 22008) }, + { CDF3( 2675, 7700, 12150) }, + { 
CDF3(29954, 32526, 32690) }, + { CDF3(16126, 28982, 31633) }, + { CDF3( 9030, 21361, 27352) }, + { CDF3( 5411, 14793, 21271) }, + { CDF3( 2943, 8422, 13163) }, + { CDF3(29539, 32601, 32730) }, + { CDF3(18125, 30385, 32201) }, + { CDF3(10422, 24090, 29468) }, + { CDF3( 6468, 17487, 24438) }, + { CDF3( 2970, 8653, 13531) }, + { CDF3(30912, 32715, 32748) }, + { CDF3(20666, 31373, 32497) }, + { CDF3(12509, 26640, 30917) }, + { CDF3( 8058, 20629, 27290) }, + { CDF3( 4231, 12006, 18052) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3(10202, 20633, 25484) }, + { CDF3(27336, 31445, 32352) }, + { CDF3(12420, 24384, 28552) }, + { CDF3( 7648, 18115, 23856) }, + { CDF3( 5662, 14341, 19902) }, + { CDF3( 3611, 10328, 15390) }, + { CDF3(30945, 32616, 32736) }, + { CDF3(18682, 30505, 32253) }, + { CDF3(11513, 25336, 30203) }, + { CDF3( 7449, 19452, 26148) }, + { CDF3( 4482, 13051, 18886) }, + { CDF3(32022, 32690, 32747) }, + { CDF3(18578, 30501, 32146) }, + { CDF3(11249, 23368, 28631) }, + { CDF3( 5645, 16958, 22158) }, + { CDF3( 5009, 11444, 16637) }, + { CDF3(31357, 32710, 32748) }, + { CDF3(21552, 31494, 32504) }, + { CDF3(13891, 27677, 31340) }, + { CDF3( 9051, 22098, 28172) }, + { CDF3( 5190, 13377, 19486) }, + { CDF3(32364, 32740, 32748) }, + { CDF3(24839, 31907, 32551) }, + { CDF3(17160, 28779, 31696) }, + { CDF3(12452, 24137, 29602) }, + { CDF3( 6165, 15389, 22477) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 
16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 2575, 7281, 11077) }, + { CDF3(14002, 20866, 25402) }, + { CDF3( 6343, 15056, 19658) }, + { CDF3( 4474, 11858, 17041) }, + { CDF3( 2865, 8299, 12534) }, + { CDF3( 1344, 3949, 6391) }, + { CDF3(24720, 31239, 32459) }, + { CDF3(12585, 25356, 29968) }, + { CDF3( 7181, 18246, 24444) }, + { CDF3( 5025, 13667, 19885) }, + { CDF3( 2521, 7304, 11605) }, + { CDF3(29908, 32252, 32584) }, + { CDF3(17421, 29156, 31575) }, + { CDF3( 9889, 22188, 27782) }, + { CDF3( 5878, 15647, 22123) }, + { CDF3( 2814, 8665, 13323) }, + { CDF3(30183, 32568, 32713) }, + { CDF3(18528, 30195, 32049) }, + { CDF3(10982, 24606, 29657) }, + { CDF3( 6957, 18165, 25231) }, + { CDF3( 3508, 10118, 15468) }, + { CDF3(31761, 32736, 32748) }, + { CDF3(21041, 31328, 32546) }, + { CDF3(12568, 26732, 31166) }, + { CDF3( 8052, 20720, 27733) }, + { CDF3( 4336, 12192, 18396) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 
16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, + }, .dc_sign = { + { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } }, + { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } }, + }, .br_tok = { + { + { + { CDF3(16138, 22223, 25509) }, + { CDF3(15347, 22430, 26332) }, + { CDF3( 9614, 16736, 21332) }, + { CDF3( 6600, 12275, 16907) }, + { CDF3( 4811, 9424, 13547) }, + { CDF3( 3748, 7809, 11420) }, + { CDF3( 2254, 4587, 6890) }, + { CDF3(15196, 20284, 23177) }, + { CDF3(18317, 25469, 28451) }, + { CDF3(13918, 21651, 25842) }, + { CDF3(10052, 17150, 21995) }, + { CDF3( 7499, 13630, 18587) }, + { CDF3( 6158, 11417, 16003) }, + { CDF3( 4014, 7785, 11252) }, + { CDF3(15048, 21067, 24384) }, + { CDF3(18202, 25346, 28553) }, + { CDF3(14302, 22019, 26356) }, + { CDF3(10839, 18139, 23166) }, + { CDF3( 8715, 15744, 20806) }, + { CDF3( 7536, 13576, 18544) }, + { CDF3( 5413, 10335, 14498) }, + }, { + { CDF3(17394, 24501, 27895) }, + { CDF3(15889, 23420, 27185) 
}, + { CDF3(11561, 19133, 23870) }, + { CDF3( 8285, 14812, 19844) }, + { CDF3( 6496, 12043, 16550) }, + { CDF3( 4771, 9574, 13677) }, + { CDF3( 3603, 6830, 10144) }, + { CDF3(21656, 27704, 30200) }, + { CDF3(21324, 27915, 30511) }, + { CDF3(17327, 25336, 28997) }, + { CDF3(13417, 21381, 26033) }, + { CDF3(10132, 17425, 22338) }, + { CDF3( 8580, 15016, 19633) }, + { CDF3( 5694, 11477, 16411) }, + { CDF3(24116, 29780, 31450) }, + { CDF3(23853, 29695, 31591) }, + { CDF3(20085, 27614, 30428) }, + { CDF3(15326, 24335, 28575) }, + { CDF3(11814, 19472, 24810) }, + { CDF3(10221, 18611, 24767) }, + { CDF3( 7689, 14558, 20321) }, + }, + }, { + { + { CDF3(16214, 22380, 25770) }, + { CDF3(14213, 21304, 25295) }, + { CDF3( 9213, 15823, 20455) }, + { CDF3( 6395, 11758, 16139) }, + { CDF3( 4779, 9187, 13066) }, + { CDF3( 3821, 7501, 10953) }, + { CDF3( 2293, 4567, 6795) }, + { CDF3(15859, 21283, 23820) }, + { CDF3(18404, 25602, 28726) }, + { CDF3(14325, 21980, 26206) }, + { CDF3(10669, 17937, 22720) }, + { CDF3( 8297, 14642, 19447) }, + { CDF3( 6746, 12389, 16893) }, + { CDF3( 4324, 8251, 11770) }, + { CDF3(16532, 21631, 24475) }, + { CDF3(20667, 27150, 29668) }, + { CDF3(16728, 24510, 28175) }, + { CDF3(12861, 20645, 25332) }, + { CDF3(10076, 17361, 22417) }, + { CDF3( 8395, 14940, 19963) }, + { CDF3( 5731, 10683, 14912) }, + }, { + { CDF3(14433, 21155, 24938) }, + { CDF3(14658, 21716, 25545) }, + { CDF3( 9923, 16824, 21557) }, + { CDF3( 6982, 13052, 17721) }, + { CDF3( 5419, 10503, 15050) }, + { CDF3( 4852, 9162, 13014) }, + { CDF3( 3271, 6395, 9630) }, + { CDF3(22210, 27833, 30109) }, + { CDF3(20750, 27368, 29821) }, + { CDF3(16894, 24828, 28573) }, + { CDF3(13247, 21276, 25757) }, + { CDF3(10038, 17265, 22563) }, + { CDF3( 8587, 14947, 20327) }, + { CDF3( 5645, 11371, 15252) }, + { CDF3(22027, 27526, 29714) }, + { CDF3(23098, 29146, 31221) }, + { CDF3(19886, 27341, 30272) }, + { CDF3(15609, 23747, 28046) }, + { CDF3(11993, 20065, 24939) }, + { CDF3( 9637, 18267, 23671) }, + { 
CDF3( 7625, 13801, 19144) }, + }, + }, { + { + { CDF3(14438, 20798, 24089) }, + { CDF3(12621, 19203, 23097) }, + { CDF3( 8177, 14125, 18402) }, + { CDF3( 5674, 10501, 14456) }, + { CDF3( 4236, 8239, 11733) }, + { CDF3( 3447, 6750, 9806) }, + { CDF3( 1986, 3950, 5864) }, + { CDF3(16208, 22099, 24930) }, + { CDF3(16537, 24025, 27585) }, + { CDF3(12780, 20381, 24867) }, + { CDF3( 9767, 16612, 21416) }, + { CDF3( 7686, 13738, 18398) }, + { CDF3( 6333, 11614, 15964) }, + { CDF3( 3941, 7571, 10836) }, + { CDF3(22819, 27422, 29202) }, + { CDF3(22224, 28514, 30721) }, + { CDF3(17660, 25433, 28913) }, + { CDF3(13574, 21482, 26002) }, + { CDF3(10629, 17977, 22938) }, + { CDF3( 8612, 15298, 20265) }, + { CDF3( 5607, 10491, 14596) }, + }, { + { CDF3(13569, 19800, 23206) }, + { CDF3(13128, 19924, 23869) }, + { CDF3( 8329, 14841, 19403) }, + { CDF3( 6130, 10976, 15057) }, + { CDF3( 4682, 8839, 12518) }, + { CDF3( 3656, 7409, 10588) }, + { CDF3( 2577, 5099, 7412) }, + { CDF3(22427, 28684, 30585) }, + { CDF3(20913, 27750, 30139) }, + { CDF3(15840, 24109, 27834) }, + { CDF3(12308, 20029, 24569) }, + { CDF3(10216, 16785, 21458) }, + { CDF3( 8309, 14203, 19113) }, + { CDF3( 6043, 11168, 15307) }, + { CDF3(23166, 28901, 30998) }, + { CDF3(21899, 28405, 30751) }, + { CDF3(18413, 26091, 29443) }, + { CDF3(15233, 23114, 27352) }, + { CDF3(12683, 20472, 25288) }, + { CDF3(10702, 18259, 23409) }, + { CDF3( 8125, 14464, 19226) }, + }, + }, { + { + { CDF3( 9040, 14786, 18360) }, + { CDF3( 9979, 15718, 19415) }, + { CDF3( 7913, 13918, 18311) }, + { CDF3( 5859, 10889, 15184) }, + { CDF3( 4593, 8677, 12510) }, + { CDF3( 3820, 7396, 10791) }, + { CDF3( 1730, 3471, 5192) }, + { CDF3(11803, 18365, 22709) }, + { CDF3(11419, 18058, 22225) }, + { CDF3( 9418, 15774, 20243) }, + { CDF3( 7539, 13325, 17657) }, + { CDF3( 6233, 11317, 15384) }, + { CDF3( 5137, 9656, 13545) }, + { CDF3( 2977, 5774, 8349) }, + { CDF3(21207, 27246, 29640) }, + { CDF3(19547, 26578, 29497) }, + { CDF3(16169, 23871, 27690) }, + 
{ CDF3(12820, 20458, 25018) }, + { CDF3(10224, 17332, 22214) }, + { CDF3( 8526, 15048, 19884) }, + { CDF3( 5037, 9410, 13118) }, + }, { + { CDF3(12339, 17329, 20140) }, + { CDF3(13505, 19895, 23225) }, + { CDF3( 9847, 16944, 21564) }, + { CDF3( 7280, 13256, 18348) }, + { CDF3( 4712, 10009, 14454) }, + { CDF3( 4361, 7914, 12477) }, + { CDF3( 2870, 5628, 7995) }, + { CDF3(20061, 25504, 28526) }, + { CDF3(15235, 22878, 26145) }, + { CDF3(12985, 19958, 24155) }, + { CDF3( 9782, 16641, 21403) }, + { CDF3( 9456, 16360, 20760) }, + { CDF3( 6855, 12940, 18557) }, + { CDF3( 5661, 10564, 15002) }, + { CDF3(25656, 30602, 31894) }, + { CDF3(22570, 29107, 31092) }, + { CDF3(18917, 26423, 29541) }, + { CDF3(15940, 23649, 27754) }, + { CDF3(12803, 20581, 25219) }, + { CDF3(11082, 18695, 23376) }, + { CDF3( 7939, 14373, 19005) }, + }, + }, + }, + }, [3] = { + .skip = { + { + { CDF1(26887) }, { CDF1( 6729) }, { CDF1(10361) }, + { CDF1(17442) }, { CDF1(15045) }, { CDF1(22478) }, + { CDF1(29072) }, { CDF1( 2713) }, { CDF1(11861) }, + { CDF1(20773) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(31903) }, { CDF1( 2044) }, { CDF1( 7528) }, + { CDF1(14618) }, { CDF1(16182) }, { CDF1(24168) }, + { CDF1(31037) }, { CDF1( 2786) }, { CDF1(11194) }, + { CDF1(20155) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(32510) }, { CDF1( 8430) }, { CDF1(17318) }, + { CDF1(24154) }, { CDF1(23674) }, { CDF1(28789) }, + { CDF1(32139) }, { CDF1( 3440) }, { CDF1(13117) }, + { CDF1(22702) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, { + { CDF1(31671) }, { CDF1( 2056) }, { CDF1(11746) }, + { CDF1(16852) }, { CDF1(18635) }, { CDF1(24715) }, + { CDF1(31484) }, { CDF1( 4656) }, { CDF1(16074) }, + { CDF1(24704) }, { CDF1( 1806) }, { CDF1(14645) }, + { CDF1(25336) }, + }, { + { CDF1(31539) }, { CDF1( 8433) }, { CDF1(20576) }, + { CDF1(27904) }, { CDF1(27852) }, { CDF1(30026) }, + { CDF1(32441) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, 
{ CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, + }, + }, .eob_bin_16 = { + { + { CDF4( 6708, 8958, 14746, 22133) }, + { CDF4( 1222, 2074, 4783, 15410) }, + }, { + { CDF4(19575, 21766, 26044, 29709) }, + { CDF4( 7297, 10767, 19273, 28194) }, + }, + }, .eob_bin_32 = { + { + { CDF5( 4617, 5709, 8446, 13584, 23135) }, + { CDF5( 1156, 1702, 3675, 9274, 20539) }, + }, { + { CDF5(22086, 24282, 27010, 29770, 31743) }, + { CDF5( 7699, 10897, 20891, 26926, 31628) }, + }, + }, .eob_bin_64 = { + { + { CDF6( 6307, 7541, 12060, 16358, 22553, 27865) }, + { CDF6( 1289, 2320, 3971, 7926, 14153, 24291) }, + }, { + { CDF6(24212, 25708, 28268, 30035, 31307, 32049) }, + { CDF6( 8726, 12378, 19409, 26450, 30038, 32462) }, + }, + }, .eob_bin_128 = { + { + { CDF7( 3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { CDF7( 886, 1731, 3271, 8469, 15569, 22126, 28383) }, + }, { + { CDF7(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { CDF7( 9165, 13282, 21150, 30286, 31894, 32571, 32712) }, + }, + }, .eob_bin_256 = { + { + { CDF8( 5348, 7113, 11820, 15924, + 22106, 26777, 30334, 31757) }, + { CDF8( 2453, 4474, 6307, 8777, + 16474, 22975, 29000, 31547) }, + }, { + { CDF8(23110, 24597, 27140, 28894, + 30167, 30927, 31392, 32094) }, + { CDF8( 9998, 17661, 25178, 28097, + 31308, 32038, 32403, 32695) }, + }, + }, .eob_bin_512 = { + { CDF9( 5927, 7809, 10923, 14597, 19439, + 24135, 28456, 31142, 32060) }, + { CDF9(21093, 23043, 25742, 27658, 29097, + 29716, 30073, 30820, 31956) }, + }, .eob_bin_1024 = { + { CDF10( 6698, 8334, 11961, 15762, 20186, + 23862, 27434, 29326, 31082, 32050) }, + { CDF10(20569, 22426, 25569, 26859, 28053, + 28913, 29486, 29724, 29807, 32570) }, + }, .eob_hi_bit = { + { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20177) }, + { CDF1(20789) }, { CDF1(20262) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(21416) }, + { CDF1(20855) }, { CDF1(23410) }, { 
CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20238) }, + { CDF1(21057) }, { CDF1(19159) }, { CDF1(22337) }, + { CDF1(20159) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(20125) }, + { CDF1(20559) }, { CDF1(21707) }, { CDF1(22296) }, + { CDF1(17333) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(19941) }, + { CDF1(20527) }, { CDF1(21470) }, { CDF1(22487) }, + { CDF1(19558) }, { CDF1(22354) }, { CDF1(20331) }, + { CDF1(16384) }, { CDF1(16384) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(22752) }, + { CDF1(25006) }, { CDF1(22075) }, { CDF1(21576) }, + { CDF1(17740) }, { CDF1(21690) }, { CDF1(19211) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(21442) }, + { CDF1(22358) }, { CDF1(18503) }, { CDF1(20291) }, + { CDF1(19945) }, { CDF1(21294) }, { CDF1(21178) }, + { CDF1(19400) }, { CDF1(10556) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(24648) }, + { CDF1(24949) }, { CDF1(20708) }, { CDF1(23905) }, + { CDF1(20501) }, { CDF1( 9558) }, { CDF1( 9423) }, + { CDF1(30365) }, { CDF1(19253) }, + }, + }, { + { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(26064) }, + { CDF1(22098) }, { CDF1(19613) }, { CDF1(20525) }, + { CDF1(17595) }, { CDF1(16618) }, { CDF1(20497) }, + { CDF1(18989) }, { CDF1(15513) }, + }, { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + { CDF1(16384) }, { CDF1(16384) }, + }, + }, + }, .eob_base_tok = { + { + { + { CDF2(22497, 31198) }, { CDF2(31715, 32495) }, + { CDF2(31606, 32337) }, { CDF2(30388, 31990) }, + }, { + { CDF2(27877, 31584) }, { CDF2(32170, 32728) }, + { CDF2(32155, 32688) }, { CDF2(32219, 
32702) }, + }, + }, { + { + { CDF2(21457, 31043) }, { CDF2(31951, 32483) }, + { CDF2(32153, 32562) }, { CDF2(31473, 32215) }, + }, { + { CDF2(27558, 31151) }, { CDF2(32020, 32640) }, + { CDF2(32097, 32575) }, { CDF2(32242, 32719) }, + }, + }, { + { + { CDF2(19980, 30591) }, { CDF2(32219, 32597) }, + { CDF2(32581, 32706) }, { CDF2(31803, 32287) }, + }, { + { CDF2(26473, 30507) }, { CDF2(32431, 32723) }, + { CDF2(32196, 32611) }, { CDF2(31588, 32528) }, + }, + }, { + { + { CDF2(24647, 30463) }, { CDF2(32412, 32695) }, + { CDF2(32468, 32720) }, { CDF2(31269, 32523) }, + }, { + { CDF2(28482, 31505) }, { CDF2(32152, 32701) }, + { CDF2(31732, 32598) }, { CDF2(31767, 32712) }, + }, + }, { + { + { CDF2(12358, 24977) }, { CDF2(31331, 32385) }, + { CDF2(32634, 32756) }, { CDF2(30411, 32548) }, + }, { + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + { CDF2(10923, 21845) }, { CDF2(10923, 21845) }, + }, + }, + }, .base_tok = { + { + { + { CDF3( 7062, 16472, 22319) }, + { CDF3(24538, 32261, 32674) }, + { CDF3(13675, 28041, 31779) }, + { CDF3( 8590, 20674, 27631) }, + { CDF3( 5685, 14675, 22013) }, + { CDF3( 3655, 9898, 15731) }, + { CDF3(26493, 32418, 32658) }, + { CDF3(16376, 29342, 32090) }, + { CDF3(10594, 22649, 28970) }, + { CDF3( 8176, 17170, 24303) }, + { CDF3( 5605, 12694, 19139) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(23888, 31902, 32542) }, + { CDF3(18612, 29687, 31987) }, + { CDF3(16245, 24852, 29249) }, + { CDF3(15765, 22608, 27559) }, + { CDF3(19895, 24699, 27510) }, + { CDF3(28401, 32212, 32457) }, + { CDF3(15274, 27825, 30980) }, + { CDF3( 9364, 18128, 24332) }, + { CDF3( 2283, 8193, 15082) }, + { CDF3( 1228, 3972, 7881) }, + { CDF3(29455, 32469, 32620) }, + { 
CDF3(17981, 28245, 31388) }, + { CDF3(10921, 20098, 26240) }, + { CDF3( 3743, 11829, 18657) }, + { CDF3( 2374, 9593, 15715) }, + { CDF3(31068, 32466, 32635) }, + { CDF3(20321, 29572, 31971) }, + { CDF3(10771, 20255, 27119) }, + { CDF3( 2795, 10410, 17361) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 9320, 22102, 27840) }, + { CDF3(27057, 32464, 32724) }, + { CDF3(16331, 30268, 32309) }, + { CDF3(10319, 23935, 29720) }, + { CDF3( 6189, 16448, 24106) }, + { CDF3( 3589, 10884, 18808) }, + { CDF3(29026, 32624, 32748) }, + { CDF3(19226, 31507, 32587) }, + { CDF3(12692, 26921, 31203) }, + { CDF3( 7049, 19532, 27635) }, + { CDF3( 7727, 15669, 23252) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3(28056, 32625, 32748) }, + { CDF3(22383, 32075, 32669) }, + { CDF3(15417, 27098, 31749) }, + { CDF3(18127, 26493, 27190) }, + { CDF3( 5461, 16384, 21845) }, + { CDF3(27982, 32091, 32584) }, + { CDF3(19045, 29868, 31972) }, + { CDF3(10397, 22266, 27932) }, + { CDF3( 5990, 13697, 21500) }, + { CDF3( 1792, 6912, 15104) }, + { CDF3(28198, 32501, 32718) }, + { CDF3(21534, 31521, 32569) }, + { CDF3(11109, 25217, 30017) }, + { CDF3( 5671, 15124, 26151) }, + { CDF3( 4681, 14043, 18725) }, + { CDF3(28688, 32580, 32741) }, + { CDF3(22576, 32079, 32661) }, + { CDF3(10627, 22141, 28340) }, + { CDF3( 9362, 14043, 28087) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 7754, 16948, 22142) }, + { CDF3(25670, 32330, 32691) }, + { CDF3(15663, 29225, 31994) }, + { CDF3( 9878, 23288, 29158) }, + { CDF3( 6419, 17088, 24336) }, + { CDF3( 3859, 11003, 17039) }, + { CDF3(27562, 32595, 32725) }, + { CDF3(17575, 30588, 32399) }, + { CDF3(10819, 24838, 30309) }, + { CDF3( 7124, 18686, 25916) }, + 
{ CDF3( 4479, 12688, 19340) }, + { CDF3(28385, 32476, 32673) }, + { CDF3(15306, 29005, 31938) }, + { CDF3( 8937, 21615, 28322) }, + { CDF3( 5982, 15603, 22786) }, + { CDF3( 3620, 10267, 16136) }, + { CDF3(27280, 32464, 32667) }, + { CDF3(15607, 29160, 32004) }, + { CDF3( 9091, 22135, 28740) }, + { CDF3( 6232, 16632, 24020) }, + { CDF3( 4047, 11377, 17672) }, + { CDF3(29220, 32630, 32718) }, + { CDF3(19650, 31220, 32462) }, + { CDF3(13050, 26312, 30827) }, + { CDF3( 9228, 20870, 27468) }, + { CDF3( 6146, 15149, 21971) }, + { CDF3(30169, 32481, 32623) }, + { CDF3(17212, 29311, 31554) }, + { CDF3( 9911, 21311, 26882) }, + { CDF3( 4487, 13314, 20372) }, + { CDF3( 2570, 7772, 12889) }, + { CDF3(30924, 32613, 32708) }, + { CDF3(19490, 30206, 32107) }, + { CDF3(11232, 23998, 29276) }, + { CDF3( 6769, 17955, 25035) }, + { CDF3( 4398, 12623, 19214) }, + { CDF3(30609, 32627, 32722) }, + { CDF3(19370, 30582, 32287) }, + { CDF3(10457, 23619, 29409) }, + { CDF3( 6443, 17637, 24834) }, + { CDF3( 4645, 13236, 20106) }, + }, { + { CDF3( 8626, 20271, 26216) }, + { CDF3(26707, 32406, 32711) }, + { CDF3(16999, 30329, 32286) }, + { CDF3(11445, 25123, 30286) }, + { CDF3( 6411, 18828, 25601) }, + { CDF3( 6801, 12458, 20248) }, + { CDF3(29918, 32682, 32748) }, + { CDF3(20649, 31739, 32618) }, + { CDF3(12879, 27773, 31581) }, + { CDF3( 7896, 21751, 28244) }, + { CDF3( 5260, 14870, 23698) }, + { CDF3(29252, 32593, 32731) }, + { CDF3(17072, 30460, 32294) }, + { CDF3(10653, 24143, 29365) }, + { CDF3( 6536, 17490, 23983) }, + { CDF3( 4929, 13170, 20085) }, + { CDF3(28137, 32518, 32715) }, + { CDF3(18171, 30784, 32407) }, + { CDF3(11437, 25436, 30459) }, + { CDF3( 7252, 18534, 26176) }, + { CDF3( 4126, 13353, 20978) }, + { CDF3(31162, 32726, 32748) }, + { CDF3(23017, 32222, 32701) }, + { CDF3(15629, 29233, 32046) }, + { CDF3( 9387, 22621, 29480) }, + { CDF3( 6922, 17616, 25010) }, + { CDF3(28838, 32265, 32614) }, + { CDF3(19701, 30206, 31920) }, + { CDF3(11214, 22410, 27933) }, + { CDF3( 5320, 
14177, 23034) }, + { CDF3( 5049, 12881, 17827) }, + { CDF3(27484, 32471, 32734) }, + { CDF3(21076, 31526, 32561) }, + { CDF3(12707, 26303, 31211) }, + { CDF3( 8169, 21722, 28219) }, + { CDF3( 6045, 19406, 27042) }, + { CDF3(27753, 32572, 32745) }, + { CDF3(20832, 31878, 32653) }, + { CDF3(13250, 27356, 31674) }, + { CDF3( 7718, 21508, 29858) }, + { CDF3( 7209, 18350, 25559) }, + }, + }, { + { + { CDF3( 7876, 16901, 21741) }, + { CDF3(24001, 31898, 32625) }, + { CDF3(14529, 27959, 31451) }, + { CDF3( 8273, 20818, 27258) }, + { CDF3( 5278, 14673, 21510) }, + { CDF3( 2983, 8843, 14039) }, + { CDF3(28016, 32574, 32732) }, + { CDF3(17471, 30306, 32301) }, + { CDF3(10224, 24063, 29728) }, + { CDF3( 6602, 17954, 25052) }, + { CDF3( 4002, 11585, 17759) }, + { CDF3(30190, 32634, 32739) }, + { CDF3(17497, 30282, 32270) }, + { CDF3(10229, 23729, 29538) }, + { CDF3( 6344, 17211, 24440) }, + { CDF3( 3849, 11189, 17108) }, + { CDF3(28570, 32583, 32726) }, + { CDF3(17521, 30161, 32238) }, + { CDF3(10153, 23565, 29378) }, + { CDF3( 6455, 17341, 24443) }, + { CDF3( 3907, 11042, 17024) }, + { CDF3(30689, 32715, 32748) }, + { CDF3(21546, 31840, 32610) }, + { CDF3(13547, 27581, 31459) }, + { CDF3( 8912, 21757, 28309) }, + { CDF3( 5548, 15080, 22046) }, + { CDF3(30783, 32540, 32685) }, + { CDF3(17540, 29528, 31668) }, + { CDF3(10160, 21468, 26783) }, + { CDF3( 4724, 13393, 20054) }, + { CDF3( 2702, 8174, 13102) }, + { CDF3(31648, 32686, 32742) }, + { CDF3(20954, 31094, 32337) }, + { CDF3(12420, 25698, 30179) }, + { CDF3( 7304, 19320, 26248) }, + { CDF3( 4366, 12261, 18864) }, + { CDF3(31581, 32723, 32748) }, + { CDF3(21373, 31586, 32525) }, + { CDF3(12744, 26625, 30885) }, + { CDF3( 7431, 20322, 26950) }, + { CDF3( 4692, 13323, 20111) }, + }, { + { CDF3( 7833, 18369, 24095) }, + { CDF3(26650, 32273, 32702) }, + { CDF3(16371, 29961, 32191) }, + { CDF3(11055, 24082, 29629) }, + { CDF3( 6892, 18644, 25400) }, + { CDF3( 5006, 13057, 19240) }, + { CDF3(29834, 32666, 32748) }, + { 
CDF3(19577, 31335, 32570) }, + { CDF3(12253, 26509, 31122) }, + { CDF3( 7991, 20772, 27711) }, + { CDF3( 5677, 15910, 23059) }, + { CDF3(30109, 32532, 32720) }, + { CDF3(16747, 30166, 32252) }, + { CDF3(10134, 23542, 29184) }, + { CDF3( 5791, 16176, 23556) }, + { CDF3( 4362, 10414, 17284) }, + { CDF3(29492, 32626, 32748) }, + { CDF3(19894, 31402, 32525) }, + { CDF3(12942, 27071, 30869) }, + { CDF3( 8346, 21216, 27405) }, + { CDF3( 6572, 17087, 23859) }, + { CDF3(32035, 32735, 32748) }, + { CDF3(22957, 31838, 32618) }, + { CDF3(14724, 28572, 31772) }, + { CDF3(10364, 23999, 29553) }, + { CDF3( 7004, 18433, 25655) }, + { CDF3(27528, 32277, 32681) }, + { CDF3(16959, 31171, 32096) }, + { CDF3(10486, 23593, 27962) }, + { CDF3( 8192, 16384, 23211) }, + { CDF3( 8937, 17873, 20852) }, + { CDF3(27715, 32002, 32615) }, + { CDF3(15073, 29491, 31676) }, + { CDF3(11264, 24576, 28672) }, + { CDF3( 2341, 18725, 23406) }, + { CDF3( 7282, 18204, 25486) }, + { CDF3(28547, 32213, 32657) }, + { CDF3(20788, 29773, 32239) }, + { CDF3( 6780, 21469, 30508) }, + { CDF3( 5958, 14895, 23831) }, + { CDF3(16384, 21845, 27307) }, + }, + }, { + { + { CDF3( 5992, 14304, 19765) }, + { CDF3(22612, 31238, 32456) }, + { CDF3(13456, 27162, 31087) }, + { CDF3( 8001, 20062, 26504) }, + { CDF3( 5168, 14105, 20764) }, + { CDF3( 2632, 7771, 12385) }, + { CDF3(27034, 32344, 32709) }, + { CDF3(15850, 29415, 31997) }, + { CDF3( 9494, 22776, 28841) }, + { CDF3( 6151, 16830, 23969) }, + { CDF3( 3461, 10039, 15722) }, + { CDF3(30134, 32569, 32731) }, + { CDF3(15638, 29422, 31945) }, + { CDF3( 9150, 21865, 28218) }, + { CDF3( 5647, 15719, 22676) }, + { CDF3( 3402, 9772, 15477) }, + { CDF3(28530, 32586, 32735) }, + { CDF3(17139, 30298, 32292) }, + { CDF3(10200, 24039, 29685) }, + { CDF3( 6419, 17674, 24786) }, + { CDF3( 3544, 10225, 15824) }, + { CDF3(31333, 32726, 32748) }, + { CDF3(20618, 31487, 32544) }, + { CDF3(12901, 27217, 31232) }, + { CDF3( 8624, 21734, 28171) }, + { CDF3( 5104, 14191, 20748) }, + { CDF3( 
8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3(11206, 21090, 26561) }, + { CDF3(28759, 32279, 32671) }, + { CDF3(14171, 27952, 31569) }, + { CDF3( 9743, 22907, 29141) }, + { CDF3( 6871, 17886, 24868) }, + { CDF3( 4960, 13152, 19315) }, + { CDF3(31077, 32661, 32748) }, + { CDF3(19400, 31195, 32515) }, + { CDF3(12752, 26858, 31040) }, + { CDF3( 8370, 22098, 28591) }, + { CDF3( 5457, 15373, 22298) }, + { CDF3(31697, 32706, 32748) }, + { CDF3(17860, 30657, 32333) }, + { CDF3(12510, 24812, 29261) }, + { CDF3( 6180, 19124, 24722) }, + { CDF3( 5041, 13548, 17959) }, + { CDF3(31552, 32716, 32748) }, + { CDF3(21908, 31769, 32623) }, + { CDF3(14470, 28201, 31565) }, + { CDF3( 9493, 22982, 28608) }, + { CDF3( 6858, 17240, 24137) }, + { CDF3(32543, 32752, 32756) }, + { CDF3(24286, 32097, 32666) }, + { CDF3(15958, 29217, 32024) }, + { CDF3(10207, 24234, 29958) }, + { CDF3( 6929, 18305, 25652) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, { + { + { CDF3( 4137, 10847, 15682) }, + { CDF3(17824, 27001, 30058) }, + { CDF3(10204, 22796, 28291) }, + { CDF3( 6076, 15935, 22125) }, + { 
CDF3( 3852, 10937, 16816) }, + { CDF3( 2252, 6324, 10131) }, + { CDF3(25840, 32016, 32662) }, + { CDF3(15109, 28268, 31531) }, + { CDF3( 9385, 22231, 28340) }, + { CDF3( 6082, 16672, 23479) }, + { CDF3( 3318, 9427, 14681) }, + { CDF3(30594, 32574, 32718) }, + { CDF3(16836, 29552, 31859) }, + { CDF3( 9556, 22542, 28356) }, + { CDF3( 6305, 16725, 23540) }, + { CDF3( 3376, 9895, 15184) }, + { CDF3(29383, 32617, 32745) }, + { CDF3(18891, 30809, 32401) }, + { CDF3(11688, 25942, 30687) }, + { CDF3( 7468, 19469, 26651) }, + { CDF3( 3909, 11358, 17012) }, + { CDF3(31564, 32736, 32748) }, + { CDF3(20906, 31611, 32600) }, + { CDF3(13191, 27621, 31537) }, + { CDF3( 8768, 22029, 28676) }, + { CDF3( 5079, 14109, 20906) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, { + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 
16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + { CDF3( 8192, 16384, 24576) }, + }, + }, + }, .dc_sign = { + { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } }, + { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } }, + }, .br_tok = { + { + { + { CDF3(18315, 24289, 27551) }, + { CDF3(16854, 24068, 27835) }, + { CDF3(10140, 17927, 23173) }, + { CDF3( 6722, 12982, 18267) }, + { CDF3( 4661, 9826, 14706) }, + { CDF3( 3832, 8165, 12294) }, + { CDF3( 2795, 6098, 9245) }, + { CDF3(17145, 23326, 26672) }, + { CDF3(20733, 27680, 30308) }, + { CDF3(16032, 24461, 28546) }, + { CDF3(11653, 20093, 25081) }, + { CDF3( 9290, 16429, 22086) }, + { CDF3( 7796, 14598, 19982) }, + { CDF3( 6502, 12378, 17441) }, + { CDF3(21681, 27732, 30320) }, + { CDF3(22389, 29044, 31261) }, + { CDF3(19027, 26731, 30087) }, + { CDF3(14739, 23755, 28624) }, + { CDF3(11358, 20778, 25511) }, + { CDF3(10995, 18073, 24190) }, + { CDF3( 9162, 14990, 20617) }, + }, { + { CDF3(21425, 27952, 30388) }, + { CDF3(18062, 25838, 29034) }, + { CDF3(11956, 19881, 24808) }, + { CDF3( 7718, 15000, 20980) }, + { CDF3( 5702, 11254, 16143) }, + { CDF3( 4898, 9088, 16864) }, + { CDF3( 3679, 6776, 11907) }, + { CDF3(23294, 30160, 31663) }, + { CDF3(24397, 29896, 31836) }, + { CDF3(19245, 27128, 30593) }, + { CDF3(13202, 19825, 26404) }, + { CDF3(11578, 19297, 23957) }, + { CDF3( 8073, 13297, 21370) }, + { CDF3( 5461, 10923, 19745) }, + { CDF3(27367, 30521, 31934) }, + { CDF3(24904, 30671, 31940) }, + { CDF3(23075, 28460, 31299) }, 
+ { CDF3(14400, 23658, 30417) }, + { CDF3(13885, 23882, 28325) }, + { CDF3(14746, 22938, 27853) }, + { CDF3( 5461, 16384, 27307) }, + }, + }, { + { + { CDF3(18274, 24813, 27890) }, + { CDF3(15537, 23149, 27003) }, + { CDF3( 9449, 16740, 21827) }, + { CDF3( 6700, 12498, 17261) }, + { CDF3( 4988, 9866, 14198) }, + { CDF3( 4236, 8147, 11902) }, + { CDF3( 2867, 5860, 8654) }, + { CDF3(17124, 23171, 26101) }, + { CDF3(20396, 27477, 30148) }, + { CDF3(16573, 24629, 28492) }, + { CDF3(12749, 20846, 25674) }, + { CDF3(10233, 17878, 22818) }, + { CDF3( 8525, 15332, 20363) }, + { CDF3( 6283, 11632, 16255) }, + { CDF3(20466, 26511, 29286) }, + { CDF3(23059, 29174, 31191) }, + { CDF3(19481, 27263, 30241) }, + { CDF3(15458, 23631, 28137) }, + { CDF3(12416, 20608, 25693) }, + { CDF3(10261, 18011, 23261) }, + { CDF3( 8016, 14655, 19666) }, + }, { + { CDF3(17616, 24586, 28112) }, + { CDF3(15809, 23299, 27155) }, + { CDF3(10767, 18890, 23793) }, + { CDF3( 7727, 14255, 18865) }, + { CDF3( 6129, 11926, 16882) }, + { CDF3( 4482, 9704, 14861) }, + { CDF3( 3277, 7452, 11522) }, + { CDF3(22956, 28551, 30730) }, + { CDF3(22724, 28937, 30961) }, + { CDF3(18467, 26324, 29580) }, + { CDF3(13234, 20713, 25649) }, + { CDF3(11181, 17592, 22481) }, + { CDF3( 8291, 18358, 24576) }, + { CDF3( 7568, 11881, 14984) }, + { CDF3(24948, 29001, 31147) }, + { CDF3(25674, 30619, 32151) }, + { CDF3(20841, 26793, 29603) }, + { CDF3(14669, 24356, 28666) }, + { CDF3(11334, 23593, 28219) }, + { CDF3( 8922, 14762, 22873) }, + { CDF3( 8301, 13544, 20535) }, + }, + }, { + { + { CDF3(17113, 23733, 27081) }, + { CDF3(14139, 21406, 25452) }, + { CDF3( 8552, 15002, 19776) }, + { CDF3( 5871, 11120, 15378) }, + { CDF3( 4455, 8616, 12253) }, + { CDF3( 3469, 6910, 10386) }, + { CDF3( 2255, 4553, 6782) }, + { CDF3(18224, 24376, 27053) }, + { CDF3(19290, 26710, 29614) }, + { CDF3(14936, 22991, 27184) }, + { CDF3(11238, 18951, 23762) }, + { CDF3( 8786, 15617, 20588) }, + { CDF3( 7317, 13228, 18003) }, + { CDF3( 5101, 9512, 
13493) }, + { CDF3(22639, 28222, 30210) }, + { CDF3(23216, 29331, 31307) }, + { CDF3(19075, 26762, 29895) }, + { CDF3(15014, 23113, 27457) }, + { CDF3(11938, 19857, 24752) }, + { CDF3( 9942, 17280, 22282) }, + { CDF3( 7167, 13144, 17752) }, + }, { + { CDF3(15820, 22738, 26488) }, + { CDF3(13530, 20885, 25216) }, + { CDF3( 8395, 15530, 20452) }, + { CDF3( 6574, 12321, 16380) }, + { CDF3( 5353, 10419, 14568) }, + { CDF3( 4613, 8446, 12381) }, + { CDF3( 3440, 7158, 9903) }, + { CDF3(24247, 29051, 31224) }, + { CDF3(22118, 28058, 30369) }, + { CDF3(16498, 24768, 28389) }, + { CDF3(12920, 21175, 26137) }, + { CDF3(10730, 18619, 25352) }, + { CDF3(10187, 16279, 22791) }, + { CDF3( 9310, 14631, 22127) }, + { CDF3(24970, 30558, 32057) }, + { CDF3(24801, 29942, 31698) }, + { CDF3(22432, 28453, 30855) }, + { CDF3(19054, 25680, 29580) }, + { CDF3(14392, 23036, 28109) }, + { CDF3(12495, 20947, 26650) }, + { CDF3(12442, 20326, 26214) }, + }, + }, { + { + { CDF3(12162, 18785, 22648) }, + { CDF3(12749, 19697, 23806) }, + { CDF3( 8580, 15297, 20346) }, + { CDF3( 6169, 11749, 16543) }, + { CDF3( 4836, 9391, 13448) }, + { CDF3( 3821, 7711, 11613) }, + { CDF3( 2228, 4601, 7070) }, + { CDF3(16319, 24725, 28280) }, + { CDF3(15698, 23277, 27168) }, + { CDF3(12726, 20368, 25047) }, + { CDF3( 9912, 17015, 21976) }, + { CDF3( 7888, 14220, 19179) }, + { CDF3( 6777, 12284, 17018) }, + { CDF3( 4492, 8590, 12252) }, + { CDF3(23249, 28904, 30947) }, + { CDF3(21050, 27908, 30512) }, + { CDF3(17440, 25340, 28949) }, + { CDF3(14059, 22018, 26541) }, + { CDF3(11288, 18903, 23898) }, + { CDF3( 9411, 16342, 21428) }, + { CDF3( 6278, 11588, 15944) }, + }, { + { CDF3(13981, 20067, 23226) }, + { CDF3(16922, 23580, 26783) }, + { CDF3(11005, 19039, 24487) }, + { CDF3( 7389, 14218, 19798) }, + { CDF3( 5598, 11505, 17206) }, + { CDF3( 6090, 11213, 15659) }, + { CDF3( 3820, 7371, 10119) }, + { CDF3(21082, 26925, 29675) }, + { CDF3(21262, 28627, 31128) }, + { CDF3(18392, 26454, 30437) }, + { CDF3(14870, 
22910, 27096) },
+                    { CDF3(12620, 19484, 24908) },
+                    { CDF3( 9290, 16553, 22802) },
+                    { CDF3( 6668, 14288, 20004) },
+                    { CDF3(27704, 31055, 31949) },
+                    { CDF3(24709, 29978, 31788) },
+                    { CDF3(21668, 29264, 31657) },
+                    { CDF3(18295, 26968, 30074) },
+                    { CDF3(16399, 24422, 29313) },
+                    { CDF3(14347, 23026, 28104) },
+                    { CDF3(12370, 19806, 24477) },
+                },
+            },
+        },
+    }
+};
+/*
+ * Copy CDF state from src to dst, member by member.
+ *
+ * For every member listed below the data is copied and one entry of the
+ * destination is then set to 0: entry [n] for the update_cdf_* family and
+ * entry [1] for the update_bit_* family (presumably the adaptation counter
+ * stored behind the probabilities; confirm against the entropy decoder).
+ *
+ * The mode and coefficient members at the top are always copied. After that,
+ * if bit 0 of hdr->frame_type is clear, only m.intrabc and a subset of dmv
+ * are copied and the function returns; otherwise the remaining mode members
+ * and the full mv context are copied.
+ *
+ * Members that are not listed (for example kfym) are left untouched in dst.
+ *
+ * update_cdf_1d(n1d, name) copies the whole array with memcpy (sizeof of the
+ * destination member) and then zeroes name[n1d].
+ */
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
+                             CdfContext *const dst,
+                             const CdfContext *const src)
+{
+#define update_cdf_1d(n1d, name) \
+    do { \
+        memcpy(dst->name, src->name, sizeof(dst->name)); \
+        dst->name[n1d] = 0; \
+    } while (0)
+/* Nested variants: each level loops over one more leading dimension (j, k, l). */
+#define update_cdf_2d(n1d, n2d, name) \
+    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
+#define update_cdf_3d(n1d, n2d, n3d, name) \
+    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
+#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
+    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
+/* Two-entry variant: copy entry [0] and zero entry [1]. */
+#define update_bit_0d(name) \
+    do { \
+        dst->name[0] = src->name[0]; \
+        dst->name[1] = 0; \
+    } while (0)
+/* Loops of update_bit_0d over one, two or three leading dimensions (i, j, k). */
+#define update_bit_1d(n1d, name) \
+    for (int i = 0; i < (n1d); i++) update_bit_0d(name[i])
+#define update_bit_2d(n1d, n2d, name) \
+    for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j])
+#define update_bit_3d(n1d, n2d, n3d, name) \
+    for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k])
+    /*
+     * Some size arguments below reference `k`. That is the loop variable
+     * declared by the enclosing update_cdf_3d expansion, so the zeroed index
+     * depends on the position along that dimension (the first dimension for
+     * a 3d call, the second one for a 4d call).
+     */
+    update_bit_1d(N_BS_SIZES, m.use_filter_intra);
+    update_cdf_1d(4, m.filter_intra);
+    update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
+    update_cdf_2d(8, 6, m.angle_delta);
+    update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
+    update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
+    update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
+    update_bit_1d(3, m.skip);
+    update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition);
+    update_bit_2d(N_TX_SIZES, 13, coef.skip);
+    update_cdf_3d(2, 2, 4, coef.eob_bin_16);
+    update_cdf_3d(2, 2, 5, coef.eob_bin_32);
+    update_cdf_3d(2, 2, 6, coef.eob_bin_64);
+    update_cdf_3d(2, 2, 7, coef.eob_bin_128);
+    update_cdf_3d(2, 2, 8, coef.eob_bin_256);
+    update_cdf_2d(2, 9, coef.eob_bin_512);
+    update_cdf_2d(2, 10, coef.eob_bin_1024);
+    update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
+    update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
+    update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
+    update_bit_2d(2, 3, coef.dc_sign);
+    update_cdf_4d(4, 2, 21, 3, coef.br_tok);
+    update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
+    update_cdf_1d(7, m.cfl_sign);
+    update_cdf_2d(6, 15, m.cfl_alpha);
+    update_bit_0d(m.restore_wiener);
+    update_bit_0d(m.restore_sgrproj);
+    update_cdf_1d(2, m.restore_switchable);
+    update_cdf_1d(3, m.delta_q);
+    update_cdf_2d(5, 3, m.delta_lf);
+    update_bit_2d(7, 3, m.pal_y);
+    update_bit_1d(2, m.pal_uv);
+    update_cdf_3d(2, 7, 6, m.pal_sz);
+    update_cdf_4d(2, 7, 5, k + 1, m.color_map);
+    update_bit_2d(7, 3, m.txpart);
+    update_cdf_2d(2, 15, m.txtp_inter1);
+    update_cdf_1d(11, m.txtp_inter2);
+    update_bit_1d(4, m.txtp_inter3);
+    /* Bit 0 of frame_type clear (presumably key/intra-only frames; confirm): short path. */
+    if (!(hdr->frame_type & 1)) {
+        update_bit_0d(m.intrabc);
+        /* Only joint, classes, class0, classN and sign of dmv are copied here. */
+        update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
+        for (int k = 0; k < 2; k++) {
+            update_cdf_1d(10, dmv.comp[k].classes);
+            update_bit_0d(dmv.comp[k].class0);
+            update_bit_1d(10, dmv.comp[k].classN);
+            update_bit_0d(dmv.comp[k].sign);
+        }
+        return;
+    }
+    /* Remaining mode members, copied only when bit 0 of frame_type is set. */
+    update_bit_1d(3, m.skip_mode);
+    update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
+    update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
+    update_bit_1d(6, m.newmv_mode);
+    update_bit_1d(2, m.globalmv_mode);
+    update_bit_1d(6, m.refmv_mode);
+    update_bit_1d(3, m.drl_bit);
+    update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
+    update_bit_1d(4, m.intra);
+    update_bit_1d(5, m.comp);
+    update_bit_1d(5, m.comp_dir);
+    update_bit_1d(6, m.jnt_comp);
+    update_bit_1d(6, m.mask_comp);
+    update_bit_1d(9, m.wedge_comp);
+    update_cdf_2d(9, 15, m.wedge_idx);
+    update_bit_2d(6, 3, m.ref);
+    update_bit_2d(3, 3, m.comp_fwd_ref);
+    update_bit_2d(2, 3, m.comp_bwd_ref);
+    update_bit_2d(3, 3, m.comp_uni_ref);
+    update_bit_1d(3, m.seg_pred);
+    update_bit_1d(4, m.interintra); /* NOTE(review): cdf.h declares interintra[7][2]; only 4 entries are copied - confirm the rest are unused */
+    update_bit_1d(7, m.interintra_wedge);
+    update_cdf_2d(4, 3, m.interintra_mode);
+    update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
+    update_bit_1d(N_BS_SIZES, m.obmc);
+    /* Full mv context, including the fractional (fp) and hp members. */
+    update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
+    for (int k = 0; k < 2; k++) {
+        update_cdf_1d(10, mv.comp[k].classes);
+        update_bit_0d(mv.comp[k].class0);
+        update_bit_1d(10, mv.comp[k].classN);
+        update_cdf_2d(2, 3, mv.comp[k].class0_fp);
+        update_cdf_1d(3, mv.comp[k].classN_fp);
+        update_bit_0d(mv.comp[k].class0_hp);
+        update_bit_0d(mv.comp[k].classN_hp);
+        update_bit_0d(mv.comp[k].sign);
+    }
+}
+
+/*
+ * CDF threading wrappers.
+ *
+ * get_qcat_idx() maps q to a category index: 0 for q <= 20, 1 for q <= 60,
+ * 2 for q <= 120 and 3 otherwise. dav1d_cdf_thread_copy() uses that index to
+ * select an entry of av1_default_coef_cdf[].
+ */
+static inline int get_qcat_idx(const int q) {
+    if (q <= 20) return 0;
+    if (q <= 60) return 1;
+    if (q <= 120) return 2;
+    return 3;
+}
+/*
+ * Mark cdf as backed by the static default tables: ref is cleared and the
+ * category derived from qidx is stored in data.qcat. The t and progress
+ * members are not written here.
+ */
+void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) {
+    cdf->ref = NULL;
+    cdf->data.qcat = get_qcat_idx(qidx);
+}
+/*
+ * Fill dst with the CDF state that src describes.
+ *
+ * With a reference (src->ref != NULL) the whole CdfContext is copied from
+ * src->data.cdf. Without one, dst is assembled from the static defaults:
+ * av1_default_cdf, default_kf_y_mode_cdf, av1_default_coef_cdf[qcat] and the
+ * default mv joint/component tables (used for both mv and dmv).
+ */
+void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
+    if (src->ref) {
+        memcpy(dst, src->data.cdf, sizeof(*dst));
+    } else {
+        dst->m = av1_default_cdf;
+        memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf));
+        dst->coef = av1_default_coef_cdf[src->data.qcat];
+        memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+        memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+        dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] =
+            default_mv_component_cdf;
+    }
+}
+/*
+ * Allocate reference-counted storage for one CdfContext from c->cdf_pool.
+ *
+ * The buffer is sized for a CdfContext followed by one atomic_uint. Returns
+ * DAV1D_ERR(ENOMEM) if the reference cannot be created, 0 otherwise.
+ *
+ * When t is non-NULL, progress is pointed at the atomic_uint located right
+ * after the CdfContext, initialised to 0, and t is stored. When t is NULL,
+ * cdf->t and cdf->progress are left as they were.
+ * NOTE(review): presumably callers pass a zeroed CdfThreadContext in that
+ * case, since dav1d_cdf_thread_wait/signal test cdf->t - confirm.
+ */
+int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
+                           struct thread_data *const t)
+{
+    cdf->ref = dav1d_ref_create_using_pool(&c->cdf_pool,
+                                           sizeof(CdfContext) + sizeof(atomic_uint));
+    if (!cdf->ref) return DAV1D_ERR(ENOMEM);
+    cdf->data.cdf = cdf->ref->data;
+    if (t) {
+        cdf->progress = (atomic_uint *) &cdf->data.cdf[1]; /* first byte past the CdfContext */
+        atomic_init(cdf->progress, 0);
+        cdf->t = t;
+    }
+    return 0;
+}
+/*
+ * Make dst a copy of src (plain struct assignment) and, if src holds a
+ * reference, increment it via dav1d_ref_inc().
+ */
+void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
+                          CdfThreadContext *const src)
+{
+    *dst = *src;
+    if (src->ref)
+        dav1d_ref_inc(src->ref);
+}
+/*
+ * Drop the reference held by cdf, if any, then zero the whole
+ * CdfThreadContext (ref, data, t and progress).
+ */
+void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
+    if (cdf->ref)
+        dav1d_ref_dec(&cdf->ref);
+    memset(cdf, 0, sizeof(*cdf));
+}
+/*
+ * Block until *cdf->progress is non-zero. Returns immediately when cdf->t is
+ * NULL; progress is only dereferenced when cdf->t is set.
+ */
+void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) {
+    if (!cdf->t) return;
+    /* Fast path without taking the lock; otherwise wait on t->cond under t->lock. */
+    if (atomic_load(cdf->progress)) return;
+    pthread_mutex_lock(&cdf->t->lock);
+    while (!atomic_load(cdf->progress))
+        pthread_cond_wait(&cdf->t->cond, &cdf->t->lock);
+    pthread_mutex_unlock(&cdf->t->lock);
+}
+/*
+ * Store 1 into *cdf->progress and broadcast cdf->t->cond, both while holding
+ * cdf->t->lock. Does nothing when cdf->t is NULL.
+ */
+void dav1d_cdf_thread_signal(CdfThreadContext *const cdf) {
+    if (!cdf->t) return;
+
+    pthread_mutex_lock(&cdf->t->lock);
+    atomic_store(cdf->progress, 1);
+    pthread_cond_broadcast(&cdf->t->cond);
+    pthread_mutex_unlock(&cdf->t->lock);
+}
diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h
new file mode 100644
index 0000000000..36286e5de2
--- /dev/null
+++ b/third_party/dav1d/src/cdf.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_CDF_H +#define DAV1D_SRC_CDF_H + +#include <stdint.h> + +#include "src/levels.h" +#include "src/ref.h" +#include "src/thread_data.h" + +/* Buffers padded to [8] or [16] for SIMD where needed.
*/
+/*
+ * CDF storage for the mode-level members of a CdfContext (the `m` member).
+ *
+ * Every member is a uint16_t array declared through ALIGN(decl, n); members
+ * are listed from the largest alignment (32) down to the smallest (4).
+ * Several trailing dimensions are written as a sum such as `5 + 3`
+ * (presumably used entries plus padding, per the note about SIMD padding
+ * that precedes this struct - confirm).
+ * Members whose last dimension is [2] are the ones handled by the
+ * update_bit_* macros in dav1d_cdf_thread_update() (cdf.c), which copy entry
+ * [0] and zero entry [1].
+ */
+typedef struct CdfModeContext {
+    ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+    ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
+    ALIGN(uint16_t wedge_idx[9][16], 32);
+    ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
+    ALIGN(uint16_t cfl_alpha[6][16], 32);
+    ALIGN(uint16_t txtp_inter1[2][16], 32);
+    ALIGN(uint16_t txtp_inter2[12 + 4], 32);
+    ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
+    ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
+    ALIGN(uint16_t cfl_sign[8], 16);
+    ALIGN(uint16_t angle_delta[8][8], 16);
+    ALIGN(uint16_t filter_intra[5 + 3], 16);
+    ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+    ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
+    ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
+    ALIGN(uint16_t color_map[2][7][5][8], 16);
+    ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+    ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
+    ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+    ALIGN(uint16_t delta_q[4], 8);
+    ALIGN(uint16_t delta_lf[5][4], 8);
+    ALIGN(uint16_t interintra_mode[4][4], 8);
+    ALIGN(uint16_t restore_switchable[3 + 1], 8);
+    ALIGN(uint16_t restore_wiener[2], 4);
+    ALIGN(uint16_t restore_sgrproj[2], 4);
+    ALIGN(uint16_t interintra[7][2], 4);
+    ALIGN(uint16_t interintra_wedge[7][2], 4);
+    ALIGN(uint16_t txtp_inter3[4][2], 4);
+    ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+    ALIGN(uint16_t newmv_mode[6][2], 4);
+    ALIGN(uint16_t globalmv_mode[2][2], 4);
+    ALIGN(uint16_t refmv_mode[6][2], 4);
+    ALIGN(uint16_t drl_bit[3][2], 4);
+    ALIGN(uint16_t intra[4][2], 4);
+    ALIGN(uint16_t comp[5][2], 4);
+    ALIGN(uint16_t comp_dir[5][2], 4);
+    ALIGN(uint16_t jnt_comp[6][2], 4);
+    ALIGN(uint16_t mask_comp[6][2], 4);
+    ALIGN(uint16_t wedge_comp[9][2], 4);
+    ALIGN(uint16_t ref[6][3][2], 4);
+    ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
+    ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
+    ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
+    ALIGN(uint16_t txpart[7][3][2], 4);
+    ALIGN(uint16_t skip[3][2], 4);
+    ALIGN(uint16_t skip_mode[3][2], 4);
+    ALIGN(uint16_t seg_pred[3][2], 4);
+    ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
+    ALIGN(uint16_t pal_y[7][3][2], 4);
+    ALIGN(uint16_t pal_uv[2][2], 4);
+    ALIGN(uint16_t intrabc[2], 4);
+} CdfModeContext;
+/*
+ * CDF storage for the `coef` member of a CdfContext.
+ * dav1d_cdf_thread_copy() (cdf.c) initialises it as a whole from
+ * av1_default_coef_cdf[qcat] when no reference is attached.
+ */
+typedef struct CdfCoefContext {
+    ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
+    ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
+    ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
+    ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
+    ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
+    ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
+    ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
+    ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
+    ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
+    ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
+    ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
+    ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
+    ALIGN(uint16_t dc_sign[2][3][2], 4);
+} CdfCoefContext;
+/* CDF storage for one motion-vector component; CdfMvContext holds two of these. */
+typedef struct CdfMvComponent {
+    ALIGN(uint16_t classes[11 + 5], 32);
+    ALIGN(uint16_t class0_fp[2][4], 8);
+    ALIGN(uint16_t classN_fp[4], 8);
+    ALIGN(uint16_t class0_hp[2], 4);
+    ALIGN(uint16_t classN_hp[2], 4);
+    ALIGN(uint16_t class0[2], 4);
+    ALIGN(uint16_t classN[10][2], 4);
+    ALIGN(uint16_t sign[2], 4);
+} CdfMvComponent;
+/* Motion-vector CDFs: two components plus the joint array. */
+typedef struct CdfMvContext {
+    CdfMvComponent comp[2];
+    ALIGN(uint16_t joint[N_MV_JOINTS], 8);
+} CdfMvContext;
+/*
+ * Complete CDF state. CdfContext has two CdfMvContext members, mv and dmv.
+ * kfym is filled from default_kf_y_mode_cdf by dav1d_cdf_thread_copy() and is
+ * not among the members copied by dav1d_cdf_thread_update() (cdf.c).
+ */
+typedef struct CdfContext {
+    CdfModeContext m;
+    ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
+    CdfCoefContext coef;
+    CdfMvContext mv, dmv;
+} CdfContext;
+/*
+ * Handle to a CDF state: either reference-counted storage (ref != NULL,
+ * data.cdf valid) or a selector into the static default tables (ref == NULL,
+ * data.qcat valid). t and progress are used by dav1d_cdf_thread_wait() and
+ * dav1d_cdf_thread_signal(); both functions are no-ops when t is NULL.
+ */
+typedef struct CdfThreadContext {
+    Dav1dRef *ref; ///< allocation origin
+    union {
+        CdfContext *cdf; // if ref != NULL
+        unsigned qcat; // if ref == NULL, from static CDF tables
+    } data;
+    struct thread_data *t;
+    atomic_uint *progress;
+} CdfThreadContext;
+/*
+ * Lifecycle helpers implemented in cdf.c.
+ * dav1d_cdf_thread_alloc() returns 0 on success and DAV1D_ERR(ENOMEM) when
+ * the backing reference cannot be created; the other functions return void.
+ */
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
+                           struct thread_data *t);
+void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
+void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
+void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
+                             const CdfContext *src);
+
+/*
+ * These are binary signals (so a signal is either "done" or "not done").
+ */
+void dav1d_cdf_thread_wait(CdfThreadContext *cdf);
+void dav1d_cdf_thread_signal(CdfThreadContext *cdf);
+
+#endif /* DAV1D_SRC_CDF_H */
diff --git a/third_party/dav1d/src/cpu.c b/third_party/dav1d/src/cpu.c
new file mode 100644
index 0000000000..f8a909f28d
--- /dev/null
+++ b/third_party/dav1d/src/cpu.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "config.h" + +#include <stdint.h> + +#include "src/cpu.h" + +static unsigned flags = 0; + +#if __has_feature(memory_sanitizer) +// memory sanitizer is inherently incompatible with asm +static unsigned flags_mask = 0; +#elif ARCH_X86 +/* Disable AVX-512 by default for the time being */ +static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL; +#else +static unsigned flags_mask = -1; +#endif +/* Stores the architecture-specific CPU flag probe result in `flags`; without HAVE_ASM, or on an architecture not listed here, flags keeps its initial value 0. */ +COLD void dav1d_init_cpu(void) { +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + flags = dav1d_get_cpu_flags_arm(); +#elif ARCH_PPC64LE + flags = dav1d_get_cpu_flags_ppc(); +#elif ARCH_X86 + flags = dav1d_get_cpu_flags_x86(); +#endif +#endif +} +/* Returns the detected flags restricted to the bits set in flags_mask. */ +COLD unsigned dav1d_get_cpu_flags(void) { + return flags & flags_mask; +} +/* Replaces flags_mask, overriding the build-time default selected above. */ +COLD void dav1d_set_cpu_flags_mask(const unsigned mask) { + flags_mask = mask; +} diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h new file mode 100644 index 0000000000..d5299f243b --- /dev/null +++ b/third_party/dav1d/src/cpu.h @@ -0,0 +1,49 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_CPU_H +#define DAV1D_SRC_CPU_H + +#include "config.h" + +#include "common/attributes.h" + +#include "dav1d/common.h" + +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/cpu.h" +#elif ARCH_PPC64LE +#include "src/ppc/cpu.h" +#elif ARCH_X86 +#include "src/x86/cpu.h" +#endif + +void dav1d_init_cpu(void); +unsigned dav1d_get_cpu_flags(void); +DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask); + +#endif /* DAV1D_SRC_CPU_H */ diff --git a/third_party/dav1d/src/ctx.h b/third_party/dav1d/src/ctx.h new file mode 100644 index 0000000000..d0e1f310ae --- /dev/null +++ b/third_party/dav1d/src/ctx.h @@ -0,0 +1,91 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_CTX_H +#define DAV1D_SRC_CTX_H + +#include <stdint.h> + +#include "common/attributes.h" +/* Overlay unions that let 1, 2, 4 or 8 uint8_t entries be written with a single integer store. NOTE(review): ATTR_ALIAS comes from common/attributes.h, presumably a may_alias attribute - confirm. */ +union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS; +union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS; +union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS; +union alias8 { uint8_t u8; } ATTR_ALIAS; +/* set_ctx_rep1 does one store of typesz bits at &var[off]; set_ctx_rep2 and set_ctx_rep4 do two and four 64-bit stores (16 and 32 bytes). The case_set macros below switch on var and expand set_ctx(), which the including file must #define before use; the _with_default variants additionally need default_memset(). */ +#define set_ctx_rep4(type, var, off, val) do { \ + const uint64_t const_val = val; \ + ((union alias64 *) &var[off + 0])->u64 = const_val; \ + ((union alias64 *) &var[off + 8])->u64 = const_val; \ + ((union alias64 *) &var[off + 16])->u64 = const_val; \ + ((union alias64 *) &var[off + 24])->u64 = const_val; \ + } while (0) +#define set_ctx_rep2(type, var, off, val) do { \ + const uint64_t const_val = val; \ + ((union alias64 *) &var[off + 0])->u64 = const_val; \ + ((union alias64 *) &var[off + 8])->u64 = const_val; \ + } while (0) +#define set_ctx_rep1(typesz, var, off, val) \ + ((union alias##typesz *) &var[off])->u##typesz = val +#define case_set(var, dir, diridx, off) \ + switch (var) { \ + case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ + case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ + case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ + case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ + case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ + case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \ + } +#define case_set_upto16(var, dir, diridx, off) \ + switch (var) { \ + case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ + case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ + case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ + case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ + case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ + } +#define 
case_set_upto32_with_default(var, dir, diridx, off) \ + switch (var) { \ + case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ + case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ + case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ + case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ + case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ + case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \ + default: default_memset(dir, diridx, off, var); break; \ + } +#define case_set_upto16_with_default(var, dir, diridx, off) \ + switch (var) { \ + case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ + case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ + case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ + case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ + case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ + default: default_memset(dir, diridx, off, var); break; \ + } + +#endif /* DAV1D_SRC_CTX_H */ diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c new file mode 100644 index 0000000000..1caf22dccc --- /dev/null +++ b/third_party/dav1d/src/data.c @@ -0,0 +1,148 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "dav1d/data.h" + +#include "common/attributes.h" +#include "common/validate.h" + +#include "src/data.h" +#include "src/ref.h" +/* Backs buf with a new dav1d_ref_create(sz) allocation and returns its writable data pointer; returns NULL when sz > SIZE_MAX / 2 or the allocation fails. */ +uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { + validate_input_or_ret(buf != NULL, NULL); + + if (sz > SIZE_MAX / 2) return NULL; + buf->ref = dav1d_ref_create(sz); + if (!buf->ref) return NULL; + buf->data = buf->ref->const_data; + buf->sz = buf->m.size = sz; + dav1d_data_props_set_defaults(&buf->m); + + return buf->ref->data; +} +/* Makes buf reference the caller's memory ptr (sz bytes) through dav1d_ref_wrap(); returns 0, or DAV1D_ERR(ENOMEM) when wrapping fails. */ +int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr, + const size_t sz, + void (*const free_callback)(const uint8_t *data, + void *cookie), + void *const cookie) +{ + validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); + + buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie); + if (!buf->ref) return DAV1D_ERR(ENOMEM); + buf->data = ptr; + buf->sz = buf->m.size = sz; + dav1d_data_props_set_defaults(&buf->m); + + return 0; +} +/* Attaches user_data to buf->m.user_data through dav1d_ref_wrap(); returns 0, or DAV1D_ERR(ENOMEM) when wrapping fails. */ +int dav1d_data_wrap_user_data_internal(Dav1dData *const buf, + const uint8_t *const user_data, + void (*const free_callback)(const uint8_t *user_data, + void *cookie), + void *const cookie) +{ + validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); + + buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie); + if (!buf->m.user_data.ref) return DAV1D_ERR(ENOMEM); + buf->m.user_data.data = user_data; + + return 0; +} + +/* Copies *src into *dst after incrementing the refs src holds (data ref and user_data ref, when set). */ +void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) { + validate_input(dst != NULL); + validate_input(dst->data == NULL); + validate_input(src != NULL); + + if (src->ref) { + validate_input(src->data != NULL); + dav1d_ref_inc(src->ref); + } + if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); + *dst 
= *src; +} +/* Transfers src's contents to dst without changing any reference count, then zeroes *src. */ +void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) { + validate_input(dst != NULL); + validate_input(dst->data == NULL); + validate_input(src != NULL); + + if (src->ref) + validate_input(src->data != NULL); + + *dst = *src; + memset(src, 0, sizeof(*src)); +} +/* Drops the user_data ref dst held, copies *src over *dst, then takes a new ref on the copied user_data (if any). */ +void dav1d_data_props_copy(Dav1dDataProps *const dst, + const Dav1dDataProps *const src) +{ + assert(dst != NULL); + assert(src != NULL); + + dav1d_ref_dec(&dst->user_data.ref); + *dst = *src; + if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref); +} +/* Resets props: timestamp INT64_MIN, duration 0, offset -1, no user_data. */ +void dav1d_data_props_set_defaults(Dav1dDataProps *const props) { + assert(props != NULL); + + props->timestamp = INT64_MIN; + props->duration = 0; + props->offset = -1; + props->user_data.data = NULL; + props->user_data.ref = NULL; +} +/* Releases buf's data ref (if any), zeroes *buf, then releases the user_data ref that was saved before the memset. */ +void dav1d_data_unref_internal(Dav1dData *const buf) { + validate_input(buf != NULL); + + struct Dav1dRef *user_data_ref = buf->m.user_data.ref; + if (buf->ref) { + validate_input(buf->data != NULL); + dav1d_ref_dec(&buf->ref); + } + memset(buf, 0, sizeof(*buf)); + dav1d_ref_dec(&user_data_ref); +} diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h new file mode 100644 index 0000000000..6ebb551076 --- /dev/null +++ b/third_party/dav1d/src/data.h @@ -0,0 +1,60 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_DATA_H +#define DAV1D_SRC_DATA_H + +#include "dav1d/data.h" + +void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src); + +/** + * Move a data reference. + */ +void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src); + +/** + * Copy the source properties to the destination and increase the + * user_data's reference count (if it's not NULL). 
+ */ +void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src); + +void dav1d_data_props_set_defaults(Dav1dDataProps *props); + +uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz); +int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz, + void (*free_callback)(const uint8_t *data, + void *user_data), + void *user_data); +int dav1d_data_wrap_user_data_internal(Dav1dData *buf, + const uint8_t *user_data, + void (*free_callback)(const uint8_t *user_data, + void *cookie), + void *cookie); +void dav1d_data_unref_internal(Dav1dData *buf); + +#endif /* DAV1D_SRC_DATA_H */ diff --git a/third_party/dav1d/src/dav1d.rc.in b/third_party/dav1d/src/dav1d.rc.in new file mode 100644 index 0000000000..ad6aab481d --- /dev/null +++ b/third_party/dav1d/src/dav1d.rc.in @@ -0,0 +1,32 @@ +#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 +#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" +#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 +#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" + +#include <windows.h> + +1 VERSIONINFO +FILETYPE VFT_DLL +FILEOS VOS_NT_WINDOWS32 +PRODUCTVERSION PROJECT_VERSION_NUMBER +FILEVERSION API_VERSION_NUMBER +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + BEGIN + VALUE "CompanyName", "VideoLAN" + VALUE "ProductName", "dav1d" + VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR + VALUE "FileVersion", API_VERSION_NUMBER_STR + VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" + VALUE "InternalName", "dav1d" + VALUE "OriginalFilename", "libdav1d.dll" + VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END diff --git a/third_party/dav1d/src/decode.c 
b/third_party/dav1d/src/decode.c new file mode 100644 index 0000000000..8462c0fae2 --- /dev/null +++ b/third_party/dav1d/src/decode.c @@ -0,0 +1,3638 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <errno.h> +#include <limits.h> +#include <string.h> +#include <stdio.h> +#include <inttypes.h> + +#include "dav1d/data.h" + +#include "common/intops.h" + +#include "src/ctx.h" +#include "src/decode.h" +#include "src/dequant_tables.h" +#include "src/env.h" +#include "src/film_grain.h" +#include "src/log.h" +#include "src/qm.h" +#include "src/recon.h" +#include "src/ref.h" +#include "src/tables.h" +#include "src/thread_task.h" +#include "src/warpmv.h" +/* Fills dq[segment][plane][0 = dc, 1 = ac] from dav1d_dq_tbl, for 8 segments when segmentation is enabled and for one otherwise. Every table index is the (per-segment) luma AC index plus the matching dc/ac delta, clipped with iclip_u8(). */ +static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr, + const Dav1dFrameHeader *const frame_hdr, + const int qidx, uint16_t (*dq)[3][2]) +{ + for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) { + const int yac = frame_hdr->segmentation.enabled ? + iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx; + const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta); + const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta); + const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta); + const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta); + const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta); + + dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0]; + dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1]; + dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0]; + dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1]; + dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0]; + dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1]; + } +} +/* Decodes one motion-vector component delta from the tile's msac using the CDFs in mv_comp and returns it with its sign applied; the magnitude is ((up << 3) | (fp << 1) | hp) + 1. */ +static int read_mv_component_diff(Dav1dTileContext *const t, + CdfMvComponent *const mv_comp, + const int have_fp) +{ + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + const int have_hp = f->frame_hdr->hp; + const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign); + const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac, + mv_comp->classes, 10); + int up, fp, hp; + + if (!cl) { + up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0); + if (have_fp) { + fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, + 
mv_comp->class0_fp[up], 3); + hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, + mv_comp->class0_hp) : 1; + } else { + fp = 3; + hp = 1; + } + } else { + up = 1 << cl; + for (int n = 0; n < cl; n++) + up |= dav1d_msac_decode_bool_adapt(&ts->msac, + mv_comp->classN[n]) << n; + if (have_fp) { + fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, + mv_comp->classN_fp, 3); + hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, + mv_comp->classN_hp) : 1; + } else { + fp = 3; + hp = 1; + } + } + + const int diff = ((up << 3) | (fp << 1) | hp) + 1; + + return sign ? -diff : diff; +} + +static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, + CdfMvContext *const mv_cdf, const int have_fp) +{ + switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, + N_MV_JOINTS - 1)) + { + case MV_JOINT_HV: + ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); + ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); + break; + case MV_JOINT_H: + ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); + break; + case MV_JOINT_V: + ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); + break; + default: + break; + } +} + +static void read_tx_tree(Dav1dTileContext *const t, + const enum RectTxfmSize from, + const int depth, uint16_t *const masks, + const int x_off, const int y_off) +{ + const Dav1dFrameContext *const f = t->f; + const int bx4 = t->bx & 31, by4 = t->by & 31; + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from]; + const int txw = t_dim->lw, txh = t_dim->lh; + int is_split; + + if (depth < 2 && from > (int) TX_4X4) { + const int cat = 2 * (TX_64X64 - t_dim->max) - depth; + const int a = t->a->tx[bx4] < txw; + const int l = t->l.tx[by4] < txh; + + is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac, + t->ts->cdf.m.txpart[cat][a + l]); + if (is_split) + masks[depth] |= 1 << (y_off * 4 + x_off); + } else { + is_split = 0; + } + + if (is_split && t_dim->max > TX_8X8) { + const 
enum RectTxfmSize sub = t_dim->sub; + const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub]; + const int txsw = sub_t_dim->w, txsh = sub_t_dim->h; + + read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0); + t->bx += txsw; + if (txw >= txh && t->bx < f->bw) + read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0); + t->bx -= txsw; + t->by += txsh; + if (txh >= txw && t->by < f->bh) { + read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1); + t->bx += txsw; + if (txw >= txh && t->bx < f->bw) + read_tx_tree(t, sub, depth + 1, masks, + x_off * 2 + 1, y_off * 2 + 1); + t->bx -= txsw; + } + t->by -= txsh; + } else { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh) + case_set_upto16(t_dim->h, l., 1, by4); +#undef set_ctx +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw) + case_set_upto16(t_dim->w, a->, 0, bx4); +#undef set_ctx + } +} + +static int neg_deinterleave(int diff, int ref, int max) { + if (!ref) return diff; + if (ref >= (max - 1)) return max - diff - 1; + if (2 * ref < max) { + if (diff <= 2 * ref) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return diff; + } else { + if (diff <= 2 * (max - ref - 1)) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return max - (diff + 1); + } +} + +static void find_matching_ref(const Dav1dTileContext *const t, + const enum EdgeFlags intra_edge_flags, + const int bw4, const int bh4, + const int w4, const int h4, + const int have_left, const int have_top, + const int ref, uint64_t masks[2]) +{ + /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5]; + int count = 0; + int have_topleft = have_top && have_left; + int have_topright = imax(bw4, bh4) < 32 && + have_top && t->bx + bw4 < t->ts->tiling.col_end && + (intra_edge_flags & 
EDGE_I444_TOP_HAS_RIGHT); + +#define bs(rp) dav1d_block_dimensions[(rp)->bs] +#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1) + + if (have_top) { + const refmvs_block *r2 = &r[-1][t->bx]; + if (matches(r2)) { + masks[0] |= 1; + count = 1; + } + int aw4 = bs(r2)[0]; + if (aw4 >= bw4) { + const int off = t->bx & (aw4 - 1); + if (off) have_topleft = 0; + if (aw4 - off > bw4) have_topright = 0; + } else { + unsigned mask = 1 << aw4; + for (int x = aw4; x < w4; x += aw4) { + r2 += aw4; + if (matches(r2)) { + masks[0] |= mask; + if (++count >= 8) return; + } + aw4 = bs(r2)[0]; + mask <<= aw4; + } + } + } + if (have_left) { + /*const*/ refmvs_block *const *r2 = r; + if (matches(&r2[0][t->bx - 1])) { + masks[1] |= 1; + if (++count >= 8) return; + } + int lh4 = bs(&r2[0][t->bx - 1])[1]; + if (lh4 >= bh4) { + if (t->by & (lh4 - 1)) have_topleft = 0; + } else { + unsigned mask = 1 << lh4; + for (int y = lh4; y < h4; y += lh4) { + r2 += lh4; + if (matches(&r2[0][t->bx - 1])) { + masks[1] |= mask; + if (++count >= 8) return; + } + lh4 = bs(&r2[0][t->bx - 1])[1]; + mask <<= lh4; + } + } + } + if (have_topleft && matches(&r[-1][t->bx - 1])) { + masks[1] |= 1ULL << 32; + if (++count >= 8) return; + } + if (have_topright && matches(&r[-1][t->bx + bw4])) { + masks[0] |= 1ULL << 32; + } +#undef matches +} + +static void derive_warpmv(const Dav1dTileContext *const t, + const int bw4, const int bh4, + const uint64_t masks[2], const union mv mv, + Dav1dWarpedMotionParams *const wmp) +{ + int pts[8][2 /* in, out */][2 /* x, y */], np = 0; + /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5]; + +#define add_sample(dx, dy, sx, sy, rp) do { \ + pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \ + pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \ + pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \ + pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \ + np++; \ +} while (0) + + // use masks[] to find the projectable motion vectors in the edges + 
if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
+        // only bit 0 of the top mask is set and the top/left flag (bit 32 of
+        // masks[1]) is clear: take one sample from the block above, with the
+        // x position moved back by (t->bx & (width of that block - 1))
+        const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
+        add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
+    } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
+        // visit the set bits of the low 32 bits of masks[0]; the index of
+        // each set bit is used as the x offset of the sampled block
+        // (ctz() is presumably count-trailing-zeros — confirm)
+        const int tz = ctz(xmask);
+        off += tz;
+        xmask >>= tz;
+        add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
+        xmask &= ~1;
+    }
+    // masks[1] == 1 compares the whole 64-bit value, so the top/left flag
+    // (bit 32) is necessarily clear in this branch as well
+    if (np < 8 && masks[1] == 1) {
+        const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
+        add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
+    } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
+        // same bit walk as for the top edge, with the bit index used as the
+        // y offset into the left column
+        const int tz = ctz(ymask);
+        off += tz;
+        ymask >>= tz;
+        add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
+        ymask &= ~1;
+    }
+    if (np < 8 && masks[1] >> 32) // top/left
+        add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
+    if (np < 8 && masks[0] >> 32) // top/right
+        add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
+    assert(np > 0 && np <= 8);
+#undef bs
+
+    // select according to motion vector difference against a threshold
+    // mvd[i] is the sum of absolute x/y differences between sample i's
+    // displacement (out - in) and mv; -1 marks a discarded sample and ret
+    // counts the samples that were kept
+    int mvd[8], ret = 0;
+    const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
+    for (int i = 0; i < np; i++) {
+        mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
+                 abs(pts[i][1][1] - pts[i][0][1] - mv.y);
+        if (mvd[i] > thresh)
+            mvd[i] = -1;
+        else
+            ret++;
+    }
+    if (!ret) {
+        // every sample exceeded the threshold: still pass a count of 1 to
+        // dav1d_find_affine_int() below
+        ret = 1;
+    } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
+        // compact pts[]: i scans forward for a discarded slot, j scans
+        // backward for a kept sample, and the kept sample is copied into the
+        // discarded slot; np - ret is the number of discarded samples
+        while (mvd[i] != -1) i++;
+        while (mvd[j] == -1) j--;
+        assert(i != j);
+        if (i > j) break;
+        // replace the discarded samples;
+        mvd[i] = mvd[j];
+        memcpy(pts[i], pts[j], sizeof(*pts));
+    }
+
+    // the warp is marked affine only if both helpers return 0; in every
+    // other case it is marked as identity
+    if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
+        !dav1d_get_shear_params(wmp))
+    {
+        wmp->type = DAV1D_WM_TYPE_AFFINE;
+    } else
+        wmp->type = DAV1D_WM_TYPE_IDENTITY;
+}
+
+// Returns 1 if any of buf[0], buf[2], ..., buf[2 * (len - 1)] is zero and
+// 0 otherwise (also 0 when len <= 0). Only every second byte is inspected.
+static inline int findoddzero(const uint8_t *buf, int len) {
+    for (int n = 0; n < len; n++)
+        if (!buf[n * 2]) return 1;
+    return 0;
+}
+
+// Decodes the palette of plane pl for block b (read_pal_uv() passes pl = 1;
+// decode_b() passes pl = 0).
+// Steps: read the palette size into b->pal_sz[pl]; merge the left and above
+// neighbour palettes into cache[], skipping values equal to the previously
+// stored one; read one flag per cache entry to select the reused entries;
+// read the remaining entries as non-negative deltas from the previous new
+// entry; finally merge reused and new entries into the output palette.
+// The output is f->frame_thread.pal[...][pl] when f->frame_thread.pass is
+// non-zero and t->scratch.pal[pl] otherwise.
+// NOTE(review): both merges only produce ordered, duplicate-free output if
+// the neighbour palettes in t->al_pal are stored in ascending order —
+// confirm against the code that writes t->al_pal.
+static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
+                           const int pl, const int sz_ctx,
+                           const int bx4, const int by4)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    // the decoded symbol is offset by 2 to give the palette size
+    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
+    uint16_t cache[16], used_cache[8];
+    // l_cache / a_cache: number of entries in the left / above neighbour
+    // palette that have not been consumed yet
+    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
+    int n_cache = 0;
+    // don't reuse above palette outside SB64 boundaries
+    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
+    const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
+
+    // fill/sort cache
+    // two-way merge of *l and *a: the smaller head is appended unless it
+    // equals the last cached value; when both heads are equal the left one
+    // is dropped and the above one is appended
+    while (l_cache && a_cache) {
+        if (*l < *a) {
+            if (!n_cache || cache[n_cache - 1] != *l)
+                cache[n_cache++] = *l;
+            l++;
+            l_cache--;
+        } else {
+            if (*a == *l) {
+                l++;
+                l_cache--;
+            }
+            if (!n_cache || cache[n_cache - 1] != *a)
+                cache[n_cache++] = *a;
+            a++;
+            a_cache--;
+        }
+    }
+    // append whatever is left of the list that was not exhausted
+    if (l_cache) {
+        do {
+            if (!n_cache || cache[n_cache - 1] != *l)
+                cache[n_cache++] = *l;
+            l++;
+        } while (--l_cache > 0);
+    } else if (a_cache) {
+        do {
+            if (!n_cache || cache[n_cache - 1] != *a)
+                cache[n_cache++] = *a;
+            a++;
+        } while (--a_cache > 0);
+    }
+
+    // find reused cache entries
+    // one bit is read per cache entry until the cache is exhausted or
+    // pal_sz entries have been selected
+    int i = 0;
+    for (int n = 0; n < n_cache && i < pal_sz; n++)
+        if (dav1d_msac_decode_bool_equi(&ts->msac))
+            used_cache[i++] = cache[n];
+    const int n_used_cache = i;
+
+    // parse new entries
+    // new entries are written to pal[n_used_cache .. pal_sz - 1] first and
+    // merged with used_cache[] afterwards
+    uint16_t *const pal = f->frame_thread.pass ?
+        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                            ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
+    if (i < pal_sz) {
+        // the first new entry is read as a raw f->cur.p.bpc-bit value
+        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+
+        if (i < pal_sz) {
+            int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
+            const int max = (1 << f->cur.p.bpc) - 1;
+
+            do {
+                // for pl == 0, !pl adds 1 so each new entry is at least
+                // prev + 1; the result is clamped to max
+                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+                prev = pal[i++] = imin(prev + delta + !pl, max);
+                if (prev + !pl >= max) {
+                    // no larger value can follow: fill the rest with max
+                    // without reading further bits
+                    for (; i < pal_sz; i++)
+                        pal[i] = max;
+                    break;
+                }
+                // never use more delta bits than the remaining range needs
+                bits = imin(bits, 1 + ulog2(max - prev - !pl));
+            } while (i < pal_sz);
+        }
+
+        // merge cache+new entries
+        // in-place merge: n indexes used_cache[], m indexes the new entries
+        // in pal[]; the write index i equals n + (m - n_used_cache) and so
+        // never passes the read index m
+        int n = 0, m = n_used_cache;
+        for (i = 0; i < pal_sz; i++) {
+            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
+                pal[i] = used_cache[n++];
+            } else {
+                assert(m < pal_sz);
+                pal[i] = pal[m++];
+            }
+        }
+    } else {
+        // the whole palette was taken from the cache
+        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
+    }
+
+    if (DEBUG_BLOCK_INFO) {
+        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
+               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
+        for (int n = 0; n < n_cache; n++)
+            printf("%c%02x", n ? ' ' : '[', cache[n]);
+        printf("%s, pal=", n_cache ? "]" : "[]");
+        for (int n = 0; n < pal_sz; n++)
+            printf("%c%02x", n ? ' ' : '[', pal[n]);
+        printf("]\n");
+    }
+}
+
+// Decodes both chroma palettes of block b: plane 1 through read_pal_plane(),
+// then plane 2 with the same number of entries (b->pal_sz[1]). One
+// dav1d_msac_decode_bool_equi() bit selects how plane 2 is coded: either as
+// deltas from the previous entry with an optional sign bit, wrapped with
+// "& max", or as raw f->cur.p.bpc-bit values.
+static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b,
+                        const int sz_ctx, const int bx4, const int by4)
+{
+    read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
+
+    // V pal coding
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    uint16_t *const pal = f->frame_thread.pass ?
+        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                            ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
+    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
+        const int bits = f->cur.p.bpc - 4 +
+                         dav1d_msac_decode_bools(&ts->msac, 2);
+        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+        const int max = (1 << f->cur.p.bpc) - 1;
+        for (int i = 1; i < b->pal_sz[1]; i++) {
+            // a sign bit is only read when the delta is non-zero
+            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
+            prev = pal[i] = (prev + delta) & max;
+        }
+    } else {
+        for (int i = 0; i < b->pal_sz[1]; i++)
+            pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+    }
+    if (DEBUG_BLOCK_INFO) {
+        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
+        for (int n = 0; n < b->pal_sz[1]; n++)
+            printf("%c%02x", n ? ' ' : '[', pal[n]);
+        printf("]\n");
+    }
+}
+
+// meant to be SIMD'able, so that theoretical complexity of this function
+// times block size goes from w4*h4 to w4+h4-1
+// handles one anti-diagonal i of the palette index map: the column j runs
+// from first down to last and the row is i - j, so the left, top and
+// top/left neighbours that are read all lie on the previous two diagonals.
+// NOTE(review): this comment used to describe inputs named a, b, top_offset
+// and left_offset, none of which exist in the signature below; the inputs
+// are pal_idx, stride, i, first and last.
+// output is order[] and ctx for each member of this diagonal.
+static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
+                          const int i, const int first, const int last,
+                          uint8_t (*const order)[8], uint8_t *const ctx)
+{
+    // the first element of diagonal i is in column first, row i - first;
+    // it has a top neighbour only when that row is not row 0
+    int have_top = i > first;
+
+    assert(pal_idx);
+    pal_idx += first + (i - first) * stride;
+    // every step moves one column left and one row down (stride - 1), so
+    // all elements after the first one have a top neighbour; n is the
+    // position of the element within the diagonal
+    for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
+        const int have_left = j > 0;
+
+        assert(have_left || have_top);
+
+        // add(): append a palette index to order[n] and mark it in mask
+#define add(v_in) do { \
+        const int v = v_in; \
+        assert((unsigned)v < 8U); \
+        order[n][o_idx++] = v; \
+        mask |= 1 << v; \
+    } while (0)
+
+        // ctx[n] values: 0 = only one neighbour available, 4 = left, top and
+        // top/left all equal, 3 = only top == left, 2 = top/left equals
+        // exactly one of top and left, 1 = all three differ
+        unsigned mask = 0;
+        int o_idx = 0;
+        if (!have_left) {
+            ctx[n] = 0;
+            add(pal_idx[-stride]);
+        } else if (!have_top) {
+            ctx[n] = 0;
+            add(pal_idx[-1]);
+        } else {
+            const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
+            const int same_t_l = t == l;
+            const int same_t_tl = t == tl;
+            const int same_l_tl = l == tl;
+            const int same_all = same_t_l & same_t_tl & same_l_tl;
+
+            if (same_all) {
+                ctx[n] = 4;
+                add(t);
+            } else if (same_t_l) {
+                ctx[n] = 3;
+                add(t);
+                add(tl);
+            } else if (same_t_tl | same_l_tl) {
+                ctx[n] = 2;
+                add(tl);
+                add(same_t_tl ? l : t);
+            } else {
+                ctx[n] = 1;
+                add(imin(t, l));
+                add(imax(t, l));
+                add(tl);
+            }
+        }
+        // append the indices 0..7 that were not added above, in ascending
+        // order, so that order[n] ends up as a permutation of 0..7
+        for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
+            if (!(mask & m))
+                order[n][o_idx++] = bit;
+        assert(o_idx == 8);
+#undef add
+    }
+}
+
+// Decodes the palette index map of plane pl into pal_idx, which is laid out
+// with a row stride of bw4 * 4 bytes. The visible area of 4 * w4 columns by
+// 4 * h4 rows is decoded diagonal by diagonal; afterwards the last visible
+// column and row are replicated to fill the block up to 4 * bw4 columns by
+// 4 * bh4 rows.
+static void read_pal_indices(Dav1dTileContext *const t,
+                             uint8_t *const pal_idx,
+                             const Av1Block *const b, const int pl,
+                             const int w4, const int h4,
+                             const int bw4, const int bh4)
+{
+    Dav1dTileState *const ts = t->ts;
+    const ptrdiff_t stride = bw4 * 4;
+    assert(pal_idx);
+    // the top/left index has no neighbours and is read with
+    // dav1d_msac_decode_uniform() instead of a context-coded symbol
+    pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+    uint16_t (*const color_map_cdf)[8] =
+        ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
+    uint8_t (*const order)[8] = t->scratch.pal_order;
+    uint8_t *const ctx = t->scratch.pal_ctx;
+    for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
+        // top/left-to-bottom/right diagonals ("wave-front")
+        // first/last are the largest/smallest column of diagonal i that lie
+        // inside the visible area
+        const int first = imin(i, w4 * 4 - 1);
+        const int last = imax(0, i - h4 * 4 + 1);
+        order_palette(pal_idx, stride, i, first, last, order, ctx);
+        for (int j = first, m = 0; j >= last; j--, m++) {
+            // the decoded symbol is a rank into order[m], which maps it
+            // back to the actual palette index
+            const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                      color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
+            pal_idx[(i - j) * stride + j] = order[m][color_idx];
+        }
+    }
+    // fill invisible edges
+    if (bw4 > w4)
+        for (int y = 0; y < 4 * h4; y++)
+            memset(&pal_idx[y * stride + 4 * w4],
+                   pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
+    if (h4 < bh4) {
+        const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
+        for (int y = h4 * 4; y < bh4 * 4; y++)
+            memcpy(&pal_idx[y * stride], src, bw4 * 4);
+    }
+}
+
+// Sets the transform size fields of block b (max_ytx, uvtx, tx_split0 and
+// tx_split1) for a block of size bs. Three cases are handled:
+//  - not skipped and either lossless for b->seg_id or a 4x4 maximum luma
+//    size: both luma and chroma use TX_4X4;
+//  - txfm_mode is not DAV1D_TX_SWITCHABLE, or the block is skipped: the
+//    maximum sizes from dav1d_max_txfm_size_for_bs are used as they are;
+//  - otherwise: read_tx_tree() is called once per max_ytx-sized unit and
+//    records the splits in tx_split[].
+// In the first two cases tx_split[] stays zero, and the above/left tx
+// contexts are only written here when txfm_mode is DAV1D_TX_SWITCHABLE.
+static void read_vartx_tree(Dav1dTileContext *const t,
+                            Av1Block *const b, const enum BlockSize bs,
+                            const int bx4, const int by4)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+
+    // var-tx tree coding
+    uint16_t tx_split[2] = { 0 };
+    b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
+    if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
+                     b->max_ytx == TX_4X4))
+    {
+        b->max_ytx = b->uvtx = TX_4X4;
+        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, TX_4X4)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        }
+    } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
+        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        }
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+    } else {
+        assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
+        int y, x, y_off, x_off;
+        const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+        // t->bx and t->by are advanced while walking the transform units and
+        // restored after each row (t->bx -= x) and after the loop (t->by -= y)
+        for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
+            for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
+                read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
+                // contexts are updated inside read_tx_tree()
+                t->bx += ytx->w;
+            }
+            t->bx -= x;
+            t->by += ytx->h;
+        }
+        t->by -= y;
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-vartxtree[%x/%x]: r=%d\n",
+                   tx_split[0], tx_split[1], t->ts->msac.rng);
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+    }
+    // only bits 0, 1, 4 and 5 may be set, so the narrowing cast to uint8_t
+    // below does not drop any information
+    assert(!(tx_split[0] & ~0x33));
+    b->tx_split0 = (uint8_t)tx_split[0];
+    b->tx_split1 = tx_split[1];
+}
+
+// Returns the smallest value found in the w4 x h4 region of ref_seg_map
+// whose top/left corner is at row by, column bx (rows are stride bytes
+// apart). Before reading the map it waits on the primary reference frame
+// via dav1d_thread_picture_wait() and returns 8 if that call reports
+// failure. Rows are scanned until h4 is exhausted or the minimum reaches 0.
+// Callers in this file treat a return value >= 8 as an error.
+static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
+                                            const int by, const int bx,
+                                            const int w4, int h4,
+                                            const uint8_t *ref_seg_map,
+                                            const ptrdiff_t stride)
+{
+    assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+    if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
+                                  (by + h4) * 4, PLANE_TYPE_BLOCK))
+    {
+        return 8;
+    }
+
+    // 8 acts as the "nothing seen yet" starting value for the minimum
+    unsigned seg_id = 8;
+    ref_seg_map += by * stride + bx;
+    do {
+        for (int x = 0; x < w4; x++)
+            seg_id = imin(seg_id,
ref_seg_map[x]); + ref_seg_map += stride; + } while (--h4 > 0 && seg_id); + assert(seg_id < 8); + + return seg_id; +} + +static int decode_b(Dav1dTileContext *const t, + const enum BlockLevel bl, + const enum BlockSize bs, + const enum BlockPartition bp, + const enum EdgeFlags intra_edge_flags) +{ + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + Av1Block b_mem, *const b = f->frame_thread.pass ? + &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bx4 = t->bx & 31, by4 = t->by & 31; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; + const int bw4 = b_dim[0], bh4 = b_dim[1]; + const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); + const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; + const int have_left = t->bx > ts->tiling.col_start; + const int have_top = t->by > ts->tiling.row_start; + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && + (bw4 > ss_hor || t->bx & 1) && + (bh4 > ss_ver || t->by & 1); + + if (f->frame_thread.pass == 2) { + if (b->intra) { + f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); + + const enum IntraPredMode y_mode_nofilt = + b->y_mode == FILTER_PRED ? 
DC_PRED : b->y_mode; +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \ + rep_macro(type, t->dir intra, off, mul) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + if (f->frame_hdr->frame_type & 1) { + refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; + for (int x = 0; x < bw4; x++) { + r[x].ref.ref[0] = 0; + r[x].bs = bs; + } + refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5]; + for (int y = 0; y < bh4 - 1; y++) { + rr[y][t->bx + bw4 - 1].ref.ref[0] = 0; + rr[y][t->bx + bw4 - 1].bs = bs; + } + } + + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir uvmode, off, mul * b->uv_mode) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + } else { + if (f->frame_hdr->frame_type & 1 /* not intrabc */ && + b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) + { + if (b->matrix[0] == SHRT_MIN) { + t->warpmv.type = DAV1D_WM_TYPE_IDENTITY; + } else { + t->warpmv.type = DAV1D_WM_TYPE_AFFINE; + t->warpmv.matrix[2] = b->matrix[0] + 0x10000; + t->warpmv.matrix[3] = b->matrix[1]; + t->warpmv.matrix[4] = b->matrix[2]; + t->warpmv.matrix[5] = b->matrix[3] + 0x10000; + dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv, + t->bx, t->by); + dav1d_get_shear_params(&t->warpmv); +#define signabs(v) v < 0 ? 
'-' : ' ', abs(v) + if (DEBUG_BLOCK_INFO) + printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n" + "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n", + signabs(t->warpmv.matrix[0]), + signabs(t->warpmv.matrix[1]), + signabs(t->warpmv.matrix[2]), + signabs(t->warpmv.matrix[3]), + signabs(t->warpmv.matrix[4]), + signabs(t->warpmv.matrix[5]), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), + b->mv2d.y, b->mv2d.x); +#undef signabs + } + } + if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; + + const uint8_t *const filter = dav1d_filter_dir[b->filter2d]; +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir filter[0], off, mul * filter[0]); \ + rep_macro(type, t->dir filter[1], off, mul * filter[1]); \ + rep_macro(type, t->dir intra, off, 0) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + + if (f->frame_hdr->frame_type & 1) { + refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; + for (int x = 0; x < bw4; x++) { + r[x].ref.ref[0] = b->ref[0] + 1; + r[x].mv.mv[0] = b->mv[0]; + r[x].bs = bs; + } + refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5]; + for (int y = 0; y < bh4 - 1; y++) { + rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1; + rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0]; + rr[y][t->bx + bw4 - 1].bs = bs; + } + } + + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir uvmode, off, mul * DC_PRED) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + } + return 0; + } + + const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; + + b->bl = bl; + b->bp = bp; + b->bs = bs; + + const Dav1dSegmentationData *seg = NULL; + + // segment_id (if seg_feature for skip/ref/gmv is enabled) + int seg_pred = 0; + if (f->frame_hdr->segmentation.enabled) { + if (!f->frame_hdr->segmentation.update_map) { + if (f->prev_segmap) 
{ + unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4, + f->prev_segmap, + f->b4_stride); + if (seg_id >= 8) return -1; + b->seg_id = seg_id; + } else { + b->seg_id = 0; + } + seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id]; + } else if (f->frame_hdr->segmentation.seg_data.preskip) { + if (f->frame_hdr->segmentation.temporal && + (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + + t->l.seg_pred[by4]]))) + { + // temporal predicted seg_id + if (f->prev_segmap) { + unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, + w4, h4, + f->prev_segmap, + f->b4_stride); + if (seg_id >= 8) return -1; + b->seg_id = seg_id; + } else { + b->seg_id = 0; + } + } else { + int seg_ctx; + const unsigned pred_seg_id = + get_cur_frame_segid(t->by, t->bx, have_top, have_left, + &seg_ctx, f->cur_segmap, f->b4_stride); + const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.seg_id[seg_ctx], + DAV1D_MAX_SEGMENTS - 1); + const unsigned last_active_seg_id = + f->frame_hdr->segmentation.seg_data.last_active_segid; + b->seg_id = neg_deinterleave(diff, pred_seg_id, + last_active_seg_id + 1); + if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error? + if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error? 
+ } + + if (DEBUG_BLOCK_INFO) + printf("Post-segid[preskip;%d]: r=%d\n", + b->seg_id, ts->msac.rng); + + seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id]; + } + } else { + b->seg_id = 0; + } + + // skip_mode + if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) && + f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1) + { + const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4]; + b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.skip_mode[smctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng); + } else { + b->skip_mode = 0; + } + + // skip + if (b->skip_mode || (seg && seg->skip)) { + b->skip = 1; + } else { + const int sctx = t->a->skip[bx4] + t->l.skip[by4]; + b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng); + } + + // segment_id + if (f->frame_hdr->segmentation.enabled && + f->frame_hdr->segmentation.update_map && + !f->frame_hdr->segmentation.seg_data.preskip) + { + if (!b->skip && f->frame_hdr->segmentation.temporal && + (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + + t->l.seg_pred[by4]]))) + { + // temporal predicted seg_id + if (f->prev_segmap) { + unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4, + f->prev_segmap, + f->b4_stride); + if (seg_id >= 8) return -1; + b->seg_id = seg_id; + } else { + b->seg_id = 0; + } + } else { + int seg_ctx; + const unsigned pred_seg_id = + get_cur_frame_segid(t->by, t->bx, have_top, have_left, + &seg_ctx, f->cur_segmap, f->b4_stride); + if (b->skip) { + b->seg_id = pred_seg_id; + } else { + const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.seg_id[seg_ctx], + DAV1D_MAX_SEGMENTS - 1); + const unsigned last_active_seg_id = + f->frame_hdr->segmentation.seg_data.last_active_segid; + b->seg_id = neg_deinterleave(diff, pred_seg_id, + last_active_seg_id + 
1); + if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error? + } + if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error? + } + + seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id]; + + if (DEBUG_BLOCK_INFO) + printf("Post-segid[postskip;%d]: r=%d\n", + b->seg_id, ts->msac.rng); + } + + // cdef index + if (!b->skip) { + const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) + + ((t->by & 16) >> 3) : 0; + if (t->cur_sb_cdef_idx_ptr[idx] == -1) { + const int v = dav1d_msac_decode_bools(&ts->msac, + f->frame_hdr->cdef.n_bits); + t->cur_sb_cdef_idx_ptr[idx] = v; + if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v; + if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v; + if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v; + + if (DEBUG_BLOCK_INFO) + printf("Post-cdef_idx[%d]: r=%d\n", + *t->cur_sb_cdef_idx_ptr, ts->msac.rng); + } + } + + // delta-q/lf + if (!(t->bx & (31 >> !f->seq_hdr->sb128)) && + !(t->by & (31 >> !f->seq_hdr->sb128))) + { + const int prev_qidx = ts->last_qidx; + const int have_delta_q = f->frame_hdr->delta.q.present && + (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip); + + int8_t prev_delta_lf[4]; + memcpy(prev_delta_lf, ts->last_delta_lf, 4); + + if (have_delta_q) { + int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.delta_q, 3); + if (delta_q == 3) { + const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); + delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) + + 1 + (1 << n_bits); + } + if (delta_q) { + if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q; + delta_q *= 1 << f->frame_hdr->delta.q.res_log2; + } + ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255); + if (have_delta_q && DEBUG_BLOCK_INFO) + printf("Post-delta_q[%d->%d]: r=%d\n", + delta_q, ts->last_qidx, ts->msac.rng); + + if (f->frame_hdr->delta.lf.present) { + const int n_lfs = f->frame_hdr->delta.lf.multi ? + f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 
4 : 2 : 1; + + for (int i = 0; i < n_lfs; i++) { + int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3); + if (delta_lf == 3) { + const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); + delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) + + 1 + (1 << n_bits); + } + if (delta_lf) { + if (dav1d_msac_decode_bool_equi(&ts->msac)) + delta_lf = -delta_lf; + delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2; + } + ts->last_delta_lf[i] = + iclip(ts->last_delta_lf[i] + delta_lf, -63, 63); + if (have_delta_q && DEBUG_BLOCK_INFO) + printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf, + ts->msac.rng); + } + } + } + if (ts->last_qidx == f->frame_hdr->quant.yac) { + // assign frame-wide q values to this sb + ts->dq = f->dq; + } else if (ts->last_qidx != prev_qidx) { + // find sb-specific quant parameters + init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem); + ts->dq = ts->dqmem; + } + if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) { + // assign frame-wide lf values to this sb + ts->lflvl = f->lf.lvl; + } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) { + // find sb-specific lf lvl parameters + dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf); + ts->lflvl = ts->lflvlmem; + } + } + + if (b->skip_mode) { + b->intra = 0; + } else if (f->frame_hdr->frame_type & 1) { + if (seg && (seg->ref >= 0 || seg->globalmv)) { + b->intra = !seg->ref; + } else { + const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.intra[ictx]); + if (DEBUG_BLOCK_INFO) + printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng); + } + } else if (f->frame_hdr->allow_intrabc) { + b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc); + if (DEBUG_BLOCK_INFO) + printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng); + } else { + b->intra = 1; + } + + // 
intra/inter-specific stuff + if (b->intra) { + uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ? + ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] : + ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]] + [dav1d_intra_mode_context[t->l.mode[by4]]]; + b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf, + N_INTRA_PRED_MODES - 1); + if (DEBUG_BLOCK_INFO) + printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng); + + // angle delta + if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED && + b->y_mode <= VERT_LEFT_PRED) + { + uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED]; + const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); + b->y_angle = angle - 3; + } else { + b->y_angle = 0; + } + + if (has_chroma) { + const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ? + cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs)); + uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode]; + b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf, + N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed); + if (DEBUG_BLOCK_INFO) + printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng); + + if (b->uv_mode == CFL_PRED) { +#define SIGN(a) (!!(a) + ((a) > 0)) + const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.cfl_sign, 7) + 1; + const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3; + assert(sign_u == sign / 3); + if (sign_u) { + const int ctx = (sign_u == 2) * 3 + sign_v; + b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.cfl_alpha[ctx], 15) + 1; + if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0]; + } else { + b->cfl_alpha[0] = 0; + } + if (sign_v) { + const int ctx = (sign_v == 2) * 3 + sign_u; + b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.cfl_alpha[ctx], 15) + 1; + if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1]; + } else { + b->cfl_alpha[1] = 0; + } +#undef SIGN + if 
(DEBUG_BLOCK_INFO) + printf("Post-uvalphas[%d/%d]: r=%d\n", + b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng); + } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED && + b->uv_mode <= VERT_LEFT_PRED) + { + uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED]; + const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); + b->uv_angle = angle - 3; + } else { + b->uv_angle = 0; + } + } + + b->pal_sz[0] = b->pal_sz[1] = 0; + if (f->frame_hdr->allow_screen_content_tools && + imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4) + { + const int sz_ctx = b_dim[2] + b_dim[3] - 2; + if (b->y_mode == DC_PRED) { + const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0); + const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.pal_y[sz_ctx][pal_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng); + if (use_y_pal) + read_pal_plane(t, b, 0, sz_ctx, bx4, by4); + } + + if (has_chroma && b->uv_mode == DC_PRED) { + const int pal_ctx = b->pal_sz[0] > 0; + const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.pal_uv[pal_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng); + if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates + read_pal_uv(t, b, sz_ctx, bx4, by4); + } + } + + if (b->y_mode == DC_PRED && !b->pal_sz[0] && + imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra) + { + const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.use_filter_intra[bs]); + if (is_filter) { + b->y_mode = FILTER_PRED; + b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.filter_intra, 4); + } + if (DEBUG_BLOCK_INFO) + printf("Post-filterintramode[%d/%d]: r=%d\n", + b->y_mode, b->y_angle, ts->msac.rng); + } + + if (b->pal_sz[0]) { + uint8_t *pal_idx; + if (f->frame_thread.pass) { + assert(ts->frame_thread.pal_idx); + pal_idx = ts->frame_thread.pal_idx; + ts->frame_thread.pal_idx += bw4 * 
bh4 * 16; + } else + pal_idx = t->scratch.pal_idx; + read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4); + if (DEBUG_BLOCK_INFO) + printf("Post-y-pal-indices: r=%d\n", ts->msac.rng); + } + + if (has_chroma && b->pal_sz[1]) { + uint8_t *pal_idx; + if (f->frame_thread.pass) { + assert(ts->frame_thread.pal_idx); + pal_idx = ts->frame_thread.pal_idx; + ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; + } else + pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; + read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4); + if (DEBUG_BLOCK_INFO) + printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng); + } + + const TxfmInfo *t_dim; + if (f->frame_hdr->segmentation.lossless[b->seg_id]) { + b->tx = b->uvtx = (int) TX_4X4; + t_dim = &dav1d_txfm_dimensions[TX_4X4]; + } else { + b->tx = dav1d_max_txfm_size_for_bs[bs][0]; + b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout]; + t_dim = &dav1d_txfm_dimensions[b->tx]; + if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) { + const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4); + uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx]; + int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf, + imin(t_dim->max, 2)); + + while (depth--) { + b->tx = t_dim->sub; + t_dim = &dav1d_txfm_dimensions[b->tx]; + } + } + if (DEBUG_BLOCK_INFO) + printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng); + } + + // reconstruction + if (f->frame_thread.pass == 1) { + f->bd_fn.read_coef_blocks(t, bs, b); + } else { + f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); + } + + if (f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1]) + { + dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride, + (const uint8_t (*)[8][2]) + &ts->lflvl[b->seg_id][0][0][0], + t->bx, t->by, f->w4, f->h4, bs, + b->tx, b->uvtx, f->cur.p.layout, + &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4], + has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL, + has_chroma ? 
&t->l.tx_lpf_uv[cby4] : NULL); + } + + // update contexts +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \ + rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \ + rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \ + rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \ + rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ + rep_macro(type, t->dir skip_mode, off, 0); \ + rep_macro(type, t->dir intra, off, mul); \ + rep_macro(type, t->dir skip, off, mul * b->skip); \ + /* see aomedia bug 2183 for why we use luma coordinates here */ \ + rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \ + if (f->frame_hdr->frame_type & 1) { \ + rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \ + rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \ + rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \ + rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \ + rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \ + } + const enum IntraPredMode y_mode_nofilt = + b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode; + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + if (b->pal_sz[0]) { + uint16_t *const pal = f->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; + for (int x = 0; x < bw4; x++) + memcpy(t->al_pal[0][bx4 + x][0], pal, 16); + for (int y = 0; y < bh4; y++) + memcpy(t->al_pal[1][by4 + y][0], pal, 16); + } + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir uvmode, off, mul * b->uv_mode) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + if (b->pal_sz[1]) { + const uint16_t (*const pal)[8] = f->frame_thread.pass ? 
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * + (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] : + t->scratch.pal; + // see aomedia bug 2183 for why we use luma coordinates here + for (int pl = 1; pl <= 2; pl++) { + for (int x = 0; x < bw4; x++) + memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16); + for (int y = 0; y < bh4; y++) + memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16); + } + } + } + if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + splat_intraref(&t->rt, t->by, t->bx, bs); + } + } else if (!(f->frame_hdr->frame_type & 1)) { + // intra block copy + refmvs_candidate mvstack[8]; + int n_mvs, ctx; + dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx, + (union refmvs_refpair) { .ref = { 0, -1 }}, + bs, intra_edge_flags, t->by, t->bx); + + if (mvstack[0].mv.mv[0].n) + b->mv[0] = mvstack[0].mv.mv[0]; + else if (mvstack[1].mv.mv[0].n) + b->mv[0] = mvstack[1].mv.mv[0]; + else { + if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) { + b->mv[0].y = 0; + b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048; + } else { + b->mv[0].y = -(512 << f->seq_hdr->sb128); + b->mv[0].x = 0; + } + } + + const union mv ref = b->mv[0]; + read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0); + + // clip intrabc motion vector to decoded parts of current tile + int border_left = ts->tiling.col_start * 4; + int border_top = ts->tiling.row_start * 4; + if (has_chroma) { + if (bw4 < 2 && ss_hor) + border_left += 4; + if (bh4 < 2 && ss_ver) + border_top += 4; + } + int src_left = t->bx * 4 + (b->mv[0].x >> 3); + int src_top = t->by * 4 + (b->mv[0].y >> 3); + int src_right = src_left + bw4 * 4; + int src_bottom = src_top + bh4 * 4; + const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4; + + // check against left or right tile boundary and adjust if necessary + if (src_left < border_left) { + src_right += border_left - src_left; + src_left += border_left - src_left; + } else if (src_right > border_right) { + src_left -= src_right - 
border_right; + src_right -= src_right - border_right; + } + // check against top tile boundary and adjust if necessary + if (src_top < border_top) { + src_bottom += border_top - src_top; + src_top += border_top - src_top; + } + + const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128); + const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128); + const int sb_size = 1 << (6 + f->seq_hdr->sb128); + // check for overlap with current superblock + if (src_bottom > sby && src_right > sbx) { + if (src_top - border_top >= src_bottom - sby) { + // if possible move src up into the previous suberblock row + src_top -= src_bottom - sby; + src_bottom -= src_bottom - sby; + } else if (src_left - border_left >= src_right - sbx) { + // if possible move src left into the previous suberblock + src_left -= src_right - sbx; + src_right -= src_right - sbx; + } + } + // move src up if it is below current superblock row + if (src_bottom > sby + sb_size) { + src_top -= src_bottom - (sby + sb_size); + src_bottom -= src_bottom - (sby + sb_size); + } + // error out if mv still overlaps with the current superblock + if (src_bottom > sby && src_right > sbx) + return -1; + + b->mv[0].x = (src_left - t->bx * 4) * 8; + b->mv[0].y = (src_top - t->by * 4) * 8; + + if (DEBUG_BLOCK_INFO) + printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n", + b->mv[0].y, b->mv[0].x, ref.y, ref.x, + mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng); + read_vartx_tree(t, b, bs, bx4, by4); + + // reconstruction + if (f->frame_thread.pass == 1) { + f->bd_fn.read_coef_blocks(t, bs, b); + b->filter2d = FILTER_2D_BILINEAR; + } else { + if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; + } + + splat_intrabc_mv(&t->rt, t->by, t->bx, bs, b->mv[0]); + +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \ + rep_macro(type, t->dir mode, off, mul * DC_PRED); \ + rep_macro(type, t->dir pal_sz, off, 0); \ + /* 
see aomedia bug 2183 for why this is outside if (has_chroma) */ \ + rep_macro(type, t->pal_sz_uv[diridx], off, 0); \ + rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ + rep_macro(type, t->dir skip_mode, off, 0); \ + rep_macro(type, t->dir intra, off, 0); \ + rep_macro(type, t->dir skip, off, mul * b->skip) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir uvmode, off, mul * DC_PRED) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + } else { + // inter-specific mode/mv coding + int is_comp, has_subpel_filter; + + if (b->skip_mode) { + is_comp = 1; + } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) && + f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1) + { + const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + is_comp = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp[ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng); + } else { + is_comp = 0; + } + + if (b->skip_mode) { + b->ref[0] = f->frame_hdr->skip_mode_refs[0]; + b->ref[1] = f->frame_hdr->skip_mode_refs[1]; + b->comp_type = COMP_INTER_AVG; + b->inter_mode = NEARESTMV_NEARESTMV; + b->drl_idx = NEAREST_DRL; + has_subpel_filter = 0; + + refmvs_candidate mvstack[8]; + int n_mvs, ctx; + dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx, + (union refmvs_refpair) { .ref = { + b->ref[0] + 1, b->ref[1] + 1 }}, + bs, intra_edge_flags, t->by, t->bx); + + b->mv[0] = mvstack[0].mv.mv[0]; + b->mv[1] = mvstack[0].mv.mv[1]; + fix_mv_precision(f->frame_hdr, &b->mv[0]); + fix_mv_precision(f->frame_hdr, &b->mv[1]); + if (DEBUG_BLOCK_INFO) + printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n", + b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x, + b->ref[0], b->ref[1]); + } else if (is_comp) { + const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, 
by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_dir[dir_ctx])) + { + // bidir - first reference (fw) + const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_fwd_ref[0][ctx1])) + { + const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_fwd_ref[2][ctx2]); + } else { + const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_fwd_ref[1][ctx2]); + } + + // second reference (bw) + const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_bwd_ref[0][ctx3])) + { + b->ref[1] = 6; + } else { + const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_bwd_ref[1][ctx4]); + } + } else { + // unidir + const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_uni_ref[0][uctx_p])) + { + b->ref[0] = 4; + b->ref[1] = 6; + } else { + const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = 0; + b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_uni_ref[1][uctx_p1]); + if (b->ref[1] == 2) { + const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.comp_uni_ref[2][uctx_p2]); + } + } + } + if (DEBUG_BLOCK_INFO) + printf("Post-refs[%d/%d]: r=%d\n", + b->ref[0], b->ref[1], ts->msac.rng); + + refmvs_candidate mvstack[8]; + int n_mvs, ctx; + dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx, + (union 
refmvs_refpair) { .ref = { + b->ref[0] + 1, b->ref[1] + 1 }}, + bs, intra_edge_flags, t->by, t->bx); + + b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.comp_inter_mode[ctx], + N_COMP_INTER_PRED_MODES - 1); + if (DEBUG_BLOCK_INFO) + printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n", + b->inter_mode, ctx, n_mvs, ts->msac.rng); + + const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode]; + b->drl_idx = NEAREST_DRL; + if (b->inter_mode == NEWMV_NEWMV) { + if (n_mvs > 1) { // NEARER, NEAR or NEARISH + const int drl_ctx_v1 = get_drl_context(mvstack, 0); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v1]); + if (b->drl_idx == NEARER_DRL && n_mvs > 2) { + const int drl_ctx_v2 = get_drl_context(mvstack, 1); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v2]); + } + if (DEBUG_BLOCK_INFO) + printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n", + b->drl_idx, n_mvs, ts->msac.rng); + } + } else if (im[0] == NEARMV || im[1] == NEARMV) { + b->drl_idx = NEARER_DRL; + if (n_mvs > 2) { // NEAR or NEARISH + const int drl_ctx_v2 = get_drl_context(mvstack, 1); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v2]); + if (b->drl_idx == NEAR_DRL && n_mvs > 3) { + const int drl_ctx_v3 = get_drl_context(mvstack, 2); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v3]); + } + if (DEBUG_BLOCK_INFO) + printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n", + b->drl_idx, n_mvs, ts->msac.rng); + } + } + assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL); + +#define assign_comp_mv(idx) \ + switch (im[idx]) { \ + case NEARMV: \ + case NEARESTMV: \ + b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \ + fix_mv_precision(f->frame_hdr, &b->mv[idx]); \ + break; \ + case GLOBALMV: \ + has_subpel_filter |= \ + f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \ + b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \ 
+ t->bx, t->by, bw4, bh4, f->frame_hdr); \ + break; \ + case NEWMV: \ + b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \ + read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \ + !f->frame_hdr->force_integer_mv); \ + break; \ + } + has_subpel_filter = imin(bw4, bh4) == 1 || + b->inter_mode != GLOBALMV_GLOBALMV; + assign_comp_mv(0); + assign_comp_mv(1); +#undef assign_comp_mv + if (DEBUG_BLOCK_INFO) + printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n", + b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x, + ts->msac.rng); + + // jnt_comp vs. seg vs. wedge + int is_segwedge = 0; + if (f->seq_hdr->masked_compound) { + const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4); + + is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.mask_comp[mask_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n", + is_segwedge, mask_ctx, ts->msac.rng); + } + + if (!is_segwedge) { + if (f->seq_hdr->jnt_comp) { + const int jnt_ctx = + get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits, + f->cur.frame_hdr->frame_offset, + f->refp[b->ref[0]].p.frame_hdr->frame_offset, + f->refp[b->ref[1]].p.frame_hdr->frame_offset, + t->a, &t->l, by4, bx4); + b->comp_type = COMP_INTER_WEIGHTED_AVG + + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.jnt_comp[jnt_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n", + b->comp_type == COMP_INTER_AVG, + jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4], + t->l.comp_type[by4], t->l.ref[0][by4], + ts->msac.rng); + } else { + b->comp_type = COMP_INTER_AVG; + } + } else { + if (wedge_allowed_mask & (1 << bs)) { + const int ctx = dav1d_wedge_ctx_lut[bs]; + b->comp_type = COMP_INTER_WEDGE - + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.wedge_comp[ctx]); + if (b->comp_type == COMP_INTER_WEDGE) + b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.wedge_idx[ctx], 15); + } else { + b->comp_type = COMP_INTER_SEG; + } + b->mask_sign = 
dav1d_msac_decode_bool_equi(&ts->msac); + if (DEBUG_BLOCK_INFO) + printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n", + b->comp_type == COMP_INTER_WEDGE, + b->wedge_idx, b->mask_sign, ts->msac.rng); + } + } else { + b->comp_type = COMP_INTER_NONE; + + // ref + if (seg && seg->ref > 0) { + b->ref[0] = seg->ref - 1; + } else if (seg && (seg->globalmv || seg->skip)) { + b->ref[0] = 0; + } else { + const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[0][ctx1])) + { + const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[1][ctx2])) + { + b->ref[0] = 6; + } else { + const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[5][ctx3]); + } + } else { + const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[2][ctx2])) + { + const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[4][ctx3]); + } else { + const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.ref[3][ctx3]); + } + } + if (DEBUG_BLOCK_INFO) + printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng); + } + b->ref[1] = -1; + + refmvs_candidate mvstack[8]; + int n_mvs, ctx; + dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx, + (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }}, + bs, intra_edge_flags, t->by, t->bx); + + // mode parsing and mv derivation from ref_mvs + if ((seg && (seg->skip || seg->globalmv)) || + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.newmv_mode[ctx & 7])) + { + if ((seg && (seg->skip || seg->globalmv)) || + 
!dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.globalmv_mode[(ctx >> 3) & 1])) + { + b->inter_mode = GLOBALMV; + b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]], + t->bx, t->by, bw4, bh4, f->frame_hdr); + has_subpel_filter = imin(bw4, bh4) == 1 || + f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION; + } else { + has_subpel_filter = 1; + if (dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.refmv_mode[(ctx >> 4) & 15])) + { // NEAREST, NEARER, NEAR or NEARISH + b->inter_mode = NEARMV; + b->drl_idx = NEARER_DRL; + if (n_mvs > 2) { // NEARER, NEAR or NEARISH + const int drl_ctx_v2 = get_drl_context(mvstack, 1); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v2]); + if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH + const int drl_ctx_v3 = + get_drl_context(mvstack, 2); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v3]); + } + } + } else { + b->inter_mode = NEARESTMV; + b->drl_idx = NEAREST_DRL; + } + assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL); + b->mv[0] = mvstack[b->drl_idx].mv.mv[0]; + if (b->drl_idx < NEAR_DRL) + fix_mv_precision(f->frame_hdr, &b->mv[0]); + } + + if (DEBUG_BLOCK_INFO) + printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n", + b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs, + ts->msac.rng); + } else { + has_subpel_filter = 1; + b->inter_mode = NEWMV; + b->drl_idx = NEAREST_DRL; + if (n_mvs > 1) { // NEARER, NEAR or NEARISH + const int drl_ctx_v1 = get_drl_context(mvstack, 0); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v1]); + if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH + const int drl_ctx_v2 = get_drl_context(mvstack, 1); + b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.drl_bit[drl_ctx_v2]); + } + } + assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL); + if (n_mvs > 1) { + b->mv[0] = 
mvstack[b->drl_idx].mv.mv[0]; + } else { + assert(!b->drl_idx); + b->mv[0] = mvstack[0].mv.mv[0]; + fix_mv_precision(f->frame_hdr, &b->mv[0]); + } + if (DEBUG_BLOCK_INFO) + printf("Post-intermode[%d,drl=%d]: r=%d\n", + b->inter_mode, b->drl_idx, ts->msac.rng); + read_mv_residual(t, &b->mv[0], &ts->cdf.mv, + !f->frame_hdr->force_integer_mv); + if (DEBUG_BLOCK_INFO) + printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n", + b->mv[0].y, b->mv[0].x, ts->msac.rng); + } + + // interintra flags + const int ii_sz_grp = dav1d_ymode_size_context[bs]; + if (f->seq_hdr->inter_intra && + interintra_allowed_mask & (1 << bs) && + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.interintra[ii_sz_grp])) + { + b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.interintra_mode[ii_sz_grp], + N_INTER_INTRA_PRED_MODES - 1); + const int wedge_ctx = dav1d_wedge_ctx_lut[bs]; + b->interintra_type = INTER_INTRA_BLEND + + dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.interintra_wedge[wedge_ctx]); + if (b->interintra_type == INTER_INTRA_WEDGE) + b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.wedge_idx[wedge_ctx], 15); + } else { + b->interintra_type = INTER_INTRA_NONE; + } + if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra && + interintra_allowed_mask & (1 << bs)) + { + printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n", + b->interintra_type, b->interintra_mode, + b->wedge_idx, ts->msac.rng); + } + + // motion variation + if (f->frame_hdr->switchable_motion_mode && + b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 && + // is not warped global motion + !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV && + f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) && + // has overlappable neighbours + ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) || + (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1)))) + { + // reaching here means the block allows obmc - check warp by + // finding matching-ref 
blocks in top/left edges + uint64_t mask[2] = { 0, 0 }; + find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4, + have_left, have_top, b->ref[0], mask); + const int allow_warp = !f->svc[b->ref[0]][0].scale && + !f->frame_hdr->force_integer_mv && + f->frame_hdr->warp_motion && (mask[0] | mask[1]); + + b->motion_mode = allow_warp ? + dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.motion_mode[bs], 2) : + dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]); + if (b->motion_mode == MM_WARP) { + has_subpel_filter = 0; + derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv); +#define signabs(v) v < 0 ? '-' : ' ', abs(v) + if (DEBUG_BLOCK_INFO) + printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n" + "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, " + "mv=y:%d,x:%d\n", + signabs(t->warpmv.matrix[0]), + signabs(t->warpmv.matrix[1]), + signabs(t->warpmv.matrix[2]), + signabs(t->warpmv.matrix[3]), + signabs(t->warpmv.matrix[4]), + signabs(t->warpmv.matrix[5]), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), + b->mv[0].y, b->mv[0].x); +#undef signabs + if (f->frame_thread.pass) { + if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) { + b->matrix[0] = t->warpmv.matrix[2] - 0x10000; + b->matrix[1] = t->warpmv.matrix[3]; + b->matrix[2] = t->warpmv.matrix[4]; + b->matrix[3] = t->warpmv.matrix[5] - 0x10000; + } else { + b->matrix[0] = SHRT_MIN; + } + } + } + + if (DEBUG_BLOCK_INFO) + printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIu64 "x/0x%" + PRIu64 "x]\n", b->motion_mode, ts->msac.rng, mask[0], + mask[1]); + } else { + b->motion_mode = MM_TRANSLATION; + } + } + + // subpel filter + enum Dav1dFilterMode filter[2]; + if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) { + if (has_subpel_filter) { + const int comp = b->comp_type != COMP_INTER_NONE; + const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0], + by4, bx4); + filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac, 
+ ts->cdf.m.filter[0][ctx1], + DAV1D_N_SWITCHABLE_FILTERS - 1); + if (f->seq_hdr->dual_filter) { + const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1, + b->ref[0], by4, bx4); + if (DEBUG_BLOCK_INFO) + printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n", + filter[0], ctx1, ts->msac.rng); + filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.filter[1][ctx2], + DAV1D_N_SWITCHABLE_FILTERS - 1); + if (DEBUG_BLOCK_INFO) + printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n", + filter[1], ctx2, ts->msac.rng); + } else { + filter[1] = filter[0]; + if (DEBUG_BLOCK_INFO) + printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n", + filter[0], ctx1, ts->msac.rng); + } + } else { + filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR; + } + } else { + filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode; + } + b->filter2d = dav1d_filter_2d[filter[1]][filter[0]]; + + read_vartx_tree(t, b, bs, bx4, by4); + + // reconstruction + if (f->frame_thread.pass == 1) { + f->bd_fn.read_coef_blocks(t, bs, b); + } else { + if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; + } + + if (f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1]) + { + const int is_globalmv = + b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV); + const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2]) + &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv]; + const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; + dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls, + t->bx, t->by, f->w4, f->h4, b->skip, bs, + f->frame_hdr->segmentation.lossless[b->seg_id] ? + (enum RectTxfmSize) TX_4X4 : b->max_ytx, + tx_split, b->uvtx, f->cur.p.layout, + &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4], + has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL, + has_chroma ? 
&t->l.tx_lpf_uv[cby4] : NULL); + } + + // context updates + if (is_comp) { + splat_tworef_mv(&t->rt, t->by, t->bx, bs, b->inter_mode, + (refmvs_refpair) { .ref = { b->ref[0], b->ref[1] }}, + (refmvs_mvpair) { .mv = { [0] = b->mv[0], [1] = b->mv[1] }}); + } else { + splat_oneref_mv(&t->rt, t->by, t->bx, bs, b->inter_mode, + b->ref[0], b->mv[0], b->interintra_type); + } + +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ + rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \ + rep_macro(type, t->dir intra, off, 0); \ + rep_macro(type, t->dir skip, off, mul * b->skip); \ + rep_macro(type, t->dir pal_sz, off, 0); \ + /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \ + rep_macro(type, t->pal_sz_uv[diridx], off, 0); \ + rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \ + rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \ + rep_macro(type, t->dir filter[0], off, mul * filter[0]); \ + rep_macro(type, t->dir filter[1], off, mul * filter[1]); \ + rep_macro(type, t->dir mode, off, mul * b->inter_mode); \ + rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \ + rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1])) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir uvmode, off, mul * DC_PRED) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + } + + // update contexts + if (f->frame_hdr->segmentation.enabled && + f->frame_hdr->segmentation.update_map) + { + uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx]; +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + for (int y = 0; y < bh4; y++) { \ + rep_macro(type, seg_ptr, 0, mul * b->seg_id); \ + seg_ptr += f->b4_stride; \ + } + case_set(bw4, NULL, 0, 0); +#undef set_ctx + } + if (!b->skip) { + uint16_t 
(*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4]; + const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15); + const int bx_idx = (bx4 & 16) >> 4; + for (int y = 0; y < bh4; y++, noskip_mask++) { + (*noskip_mask)[bx_idx] |= mask; + if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway + (*noskip_mask)[1] |= mask; + } + } + + return 0; +} + +#if __has_feature(memory_sanitizer) + +#include + +static int checked_decode_b(Dav1dTileContext *const t, + const enum BlockLevel bl, + const enum BlockSize bs, + const enum BlockPartition bp, + const enum EdgeFlags intra_edge_flags) +{ + const Dav1dFrameContext *const f = t->f; + const int err = decode_b(t, bl, bs, bp, intra_edge_flags); + + if (err == 0 && !(f->frame_thread.pass & 1)) { + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = b_dim[0], bh4 = b_dim[1]; + const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); + const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 && + (bw4 > ss_hor || t->bx & 1) && + (bh4 > ss_ver || t->by & 1); + + for (int p = 0; p < 1 + 2 * has_chroma; p++) { + const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const ptrdiff_t stride = f->cur.stride[!!p]; + const int bx = t->bx & ~ss_hor; + const int by = t->by & ~ss_ver; + const int width = w4 << (2 - ss_hor + (bw4 == ss_hor)); + const int height = h4 << (2 - ss_ver + (bh4 == ss_ver)); + + const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride + + (bx << (2 - ss_hor + !!f->seq_hdr->hbd)); + + for (int y = 0; y < height; data += stride, y++) { + const size_t line_sz = width << !!f->seq_hdr->hbd; + if (__msan_test_shadow(data, line_sz) != -1) { + fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n", + p, bx, by, w4, h4, y); + 
__msan_check_mem_is_initialized(data, line_sz); + } + } + } + } + + return err; +} + +#define decode_b checked_decode_b + +#endif /* defined(__has_feature) */ + +static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, + const EdgeNode *const node) +{ + const Dav1dFrameContext *const f = t->f; + const int hsz = 16 >> bl; + const int have_h_split = f->bw > t->bx + hsz; + const int have_v_split = f->bh > t->by + hsz; + + if (!have_h_split && !have_v_split) { + assert(bl < BL_8X8); + return decode_sb(t, bl + 1, ((const EdgeBranch *) node)->split[0]); + } + + uint16_t *pc; + enum BlockPartition bp; + int ctx, bx8, by8; + if (f->frame_thread.pass != 2) { + if (0 && bl == BL_64X64) + printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n", + f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng); + bx8 = (t->bx & 31) >> 1; + by8 = (t->by & 31) >> 1; + ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8); + pc = t->ts->cdf.m.partition[bl][ctx]; + } + + if (have_h_split && have_v_split) { + if (f->frame_thread.pass == 2) { + const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; + bp = b->bl == bl ? 
b->bp : PARTITION_SPLIT; + } else { + bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, + dav1d_partition_type_count[bl]); + if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && + (bp == PARTITION_V || bp == PARTITION_V4 || + bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT)) + { + return 1; + } + if (DEBUG_BLOCK_INFO) + printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", + f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp, + t->ts->msac.rng); + } + const uint8_t *const b = dav1d_block_sizes[bl][bp]; + + switch (bp) { + case PARTITION_NONE: + if (decode_b(t, bl, b[0], PARTITION_NONE, node->o)) + return -1; + break; + case PARTITION_H: + if (decode_b(t, bl, b[0], PARTITION_H, node->h[0])) + return -1; + t->by += hsz; + if (decode_b(t, bl, b[0], PARTITION_H, node->h[1])) + return -1; + t->by -= hsz; + break; + case PARTITION_V: + if (decode_b(t, bl, b[0], PARTITION_V, node->v[0])) + return -1; + t->bx += hsz; + if (decode_b(t, bl, b[0], PARTITION_V, node->v[1])) + return -1; + t->bx -= hsz; + break; + case PARTITION_SPLIT: + if (bl == BL_8X8) { + const EdgeTip *const tip = (const EdgeTip *) node; + assert(hsz == 1); + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0])) + return -1; + const enum Filter2d tl_filter = t->tl_4x4_filter; + t->bx++; + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1])) + return -1; + t->bx--; + t->by++; + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2])) + return -1; + t->bx++; + t->tl_4x4_filter = tl_filter; + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[3])) + return -1; + t->bx--; + t->by--; + } else { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_sb(t, bl + 1, branch->split[0])) + return 1; + t->bx += hsz; + if (decode_sb(t, bl + 1, branch->split[1])) + return 1; + t->bx -= hsz; + t->by += hsz; + if (decode_sb(t, bl + 1, branch->split[2])) + return 1; + t->bx += hsz; + if (decode_sb(t, bl + 1, branch->split[3])) + return 1; + t->bx -= hsz; + 
t->by -= hsz; + } + break; + case PARTITION_T_TOP_SPLIT: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[0])) + return -1; + t->bx += hsz; + if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[1])) + return -1; + t->bx -= hsz; + t->by += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, branch->tts[2])) + return -1; + t->by -= hsz; + break; + } + case PARTITION_T_BOTTOM_SPLIT: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, branch->tbs[0])) + return -1; + t->by += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[1])) + return -1; + t->bx += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[2])) + return -1; + t->bx -= hsz; + t->by -= hsz; + break; + } + case PARTITION_T_LEFT_SPLIT: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[0])) + return -1; + t->by += hsz; + if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[1])) + return -1; + t->by -= hsz; + t->bx += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, branch->tls[2])) + return -1; + t->bx -= hsz; + break; + } + case PARTITION_T_RIGHT_SPLIT: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, branch->trs[0])) + return -1; + t->bx += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[1])) + return -1; + t->by += hsz; + if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[2])) + return -1; + t->by -= hsz; + t->bx -= hsz; + break; + } + case PARTITION_H4: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[0])) + return -1; + t->by += hsz >> 1; + if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[1])) + return -1; + t->by += hsz >> 1; + if (decode_b(t, bl, 
b[0], PARTITION_H4, branch->h4[2])) + return -1; + t->by += hsz >> 1; + if (t->by < f->bh) + if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[3])) + return -1; + t->by -= hsz * 3 >> 1; + break; + } + case PARTITION_V4: { + const EdgeBranch *const branch = (const EdgeBranch *) node; + if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[0])) + return -1; + t->bx += hsz >> 1; + if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[1])) + return -1; + t->bx += hsz >> 1; + if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[2])) + return -1; + t->bx += hsz >> 1; + if (t->bx < f->bw) + if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[3])) + return -1; + t->bx -= hsz * 3 >> 1; + break; + } + default: assert(0); + } + } else if (have_h_split) { + unsigned is_split; + if (f->frame_thread.pass == 2) { + const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; + is_split = b->bl != bl; + } else { + is_split = dav1d_msac_decode_bool(&t->ts->msac, + gather_top_partition_prob(pc, bl)); + if (DEBUG_BLOCK_INFO) + printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", + f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, + is_split ? 
PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng); + } + + assert(bl < BL_8X8); + if (is_split) { + const EdgeBranch *const branch = (const EdgeBranch *) node; + bp = PARTITION_SPLIT; + if (decode_sb(t, bl + 1, branch->split[0])) return 1; + t->bx += hsz; + if (decode_sb(t, bl + 1, branch->split[1])) return 1; + t->bx -= hsz; + } else { + bp = PARTITION_H; + if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0], + PARTITION_H, node->h[0])) + return -1; + } + } else { + assert(have_v_split); + unsigned is_split; + if (f->frame_thread.pass == 2) { + const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; + is_split = b->bl != bl; + } else { + is_split = dav1d_msac_decode_bool(&t->ts->msac, + gather_left_partition_prob(pc, bl)); + if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split) + return 1; + if (DEBUG_BLOCK_INFO) + printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", + f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, + is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng); + } + + assert(bl < BL_8X8); + if (is_split) { + const EdgeBranch *const branch = (const EdgeBranch *) node; + bp = PARTITION_SPLIT; + if (decode_sb(t, bl + 1, branch->split[0])) return 1; + t->by += hsz; + if (decode_sb(t, bl + 1, branch->split[2])) return 1; + t->by -= hsz; + } else { + bp = PARTITION_V; + if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0], + PARTITION_V, node->v[0])) + return -1; + } + } + + if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \ + rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp]) + case_set_upto16(hsz,,,); +#undef set_ctx + } + + return 0; +} + +static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) { + memset(ctx->intra, keyframe, sizeof(ctx->intra)); + memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode)); 
+ if (keyframe) + memset(ctx->mode, DC_PRED, sizeof(ctx->mode)); + + if (pass == 2) return; + + memset(ctx->partition, 0, sizeof(ctx->partition)); + memset(ctx->skip, 0, sizeof(ctx->skip)); + memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode)); + memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y)); + memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv)); + memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra)); + memset(ctx->tx, TX_64X64, sizeof(ctx->tx)); + if (!keyframe) { + memset(ctx->ref, -1, sizeof(ctx->ref)); + memset(ctx->comp_type, 0, sizeof(ctx->comp_type)); + memset(ctx->mode, NEARESTMV, sizeof(ctx->mode)); + } + memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef)); + memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef)); + memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter)); + memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred)); + memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz)); +} + +// { Y+U+V, Y+U } * 4 +static const uint8_t ss_size_mul[4][2] = { + [DAV1D_PIXEL_LAYOUT_I400] = { 4, 4 }, + [DAV1D_PIXEL_LAYOUT_I420] = { 6, 5 }, + [DAV1D_PIXEL_LAYOUT_I422] = { 8, 6 }, + [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 }, +}; + +static void setup_tile(Dav1dTileState *const ts, + const Dav1dFrameContext *const f, + const uint8_t *const data, const size_t sz, + const int tile_row, const int tile_col, + const int tile_start_off) +{ + const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col]; + const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128; + const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1]; + const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row]; + const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1]; + const int sb_shift = f->sb_shift; + + const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; + ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? + &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : + NULL; + + ts->frame_thread.cf = f->frame_thread.cf ? 
+ (uint8_t*)f->frame_thread.cf + + (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : + NULL; + + dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf); + ts->last_qidx = f->frame_hdr->quant.yac; + memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf)); + + dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); + + ts->tiling.row = tile_row; + ts->tiling.col = tile_col; + ts->tiling.col_start = col_sb_start << sb_shift; + ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw); + ts->tiling.row_start = row_sb_start << sb_shift; + ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh); + + // Reference Restoration Unit (used for exp coding) + int sb_idx, unit_idx; + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + // vertical components only + sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w; + unit_idx = (ts->tiling.row_start & 16) >> 3; + } else { + sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start; + unit_idx = ((ts->tiling.row_start & 16) >> 3) + + ((ts->tiling.col_start & 16) >> 4); + } + for (int p = 0; p < 3; p++) { + if (!((f->lf.restore_planes >> p) & 1U)) + continue; + + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int d = f->frame_hdr->super_res.width_scale_denominator; + const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p]; + const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3; + const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift; + const int px_x = x << (unit_size_log2 + ss_hor); + const int u_idx = unit_idx + ((px_x & 64) >> 6); + const int sb128x = px_x >> 7; + if (sb128x >= f->sr_sb128w) continue; + ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx]; + } else { + ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx]; + } + + ts->lr_ref[p]->filter_v[0] = 3; + ts->lr_ref[p]->filter_v[1] = -7; + ts->lr_ref[p]->filter_v[2] = 15; + 
ts->lr_ref[p]->filter_h[0] = 3; + ts->lr_ref[p]->filter_h[1] = -7; + ts->lr_ref[p]->filter_h[2] = 15; + ts->lr_ref[p]->sgr_weights[0] = -32; + ts->lr_ref[p]->sgr_weights[1] = 31; + } + + if (f->n_tc > 1) + atomic_init(&ts->progress, row_sb_start); +} + +static void read_restoration_info(Dav1dTileContext *const t, + Av1RestorationUnit *const lr, const int p, + const enum Dav1dRestorationType frame_type) +{ + const Dav1dFrameContext *const f = t->f; + Dav1dTileState *const ts = t->ts; + + if (frame_type == DAV1D_RESTORATION_SWITCHABLE) { + const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.restore_switchable, 2); + lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ : + DAV1D_RESTORATION_WIENER : + DAV1D_RESTORATION_NONE; + } else { + const unsigned type = + dav1d_msac_decode_bool_adapt(&ts->msac, + frame_type == DAV1D_RESTORATION_WIENER ? + ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj); + lr->type = type ? frame_type : DAV1D_RESTORATION_NONE; + } + + if (lr->type == DAV1D_RESTORATION_WIENER) { + lr->filter_v[0] = p ? 0 : + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5; + lr->filter_v[1] = + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23; + lr->filter_v[2] = + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17; + + lr->filter_h[0] = p ? 
0 : + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5; + lr->filter_h[1] = + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23; + lr->filter_h[2] = + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17; + memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights)); + ts->lr_ref[p] = lr; + if (DEBUG_BLOCK_INFO) + printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n", + p, lr->filter_v[0], lr->filter_v[1], + lr->filter_v[2], lr->filter_h[0], + lr->filter_h[1], lr->filter_h[2], ts->msac.rng); + } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) { + const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4); + lr->sgr_idx = idx; + lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ? + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : + 0; + lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ? + dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : + 95; + memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v)); + memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h)); + ts->lr_ref[p] = lr; + if (DEBUG_BLOCK_INFO) + printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n", + p, lr->sgr_idx, lr->sgr_weights[0], + lr->sgr_weights[1], ts->msac.rng); + } +} + +int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { + const Dav1dFrameContext *const f = t->f; + const enum BlockLevel root_bl = f->seq_hdr->sb128 ? 
BL_128X128 : BL_64X64; + Dav1dTileState *const ts = t->ts; + const Dav1dContext *const c = f->c; + const int sb_step = f->sb_step; + const int tile_row = ts->tiling.row, tile_col = ts->tiling.col; + const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col]; + const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128; + + if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start, + ts->tiling.col_end, ts->tiling.row_start, + ts->tiling.row_end, t->by >> f->sb_shift, + ts->tiling.row); + } + + reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass); + if (f->frame_thread.pass == 2) { + for (t->bx = ts->tiling.col_start, + t->a = f->a + col_sb128_start + tile_row * f->sb128w; + t->bx < ts->tiling.col_end; t->bx += sb_step) + { + if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + return 1; + if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) + return 1; + if (t->bx & 16 || f->seq_hdr->sb128) + t->a++; + } + f->bd_fn.backup_ipred_edge(t); + return 0; + } + + // error out on symbol decoder overread + if (ts->msac.cnt < -15) return 1; + + if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { + if (c->n_fc > 1) for (int n = 0; n < 7; n++) + if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step), + PLANE_TYPE_BLOCK)) + { + return 1; + } + dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row, + ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, + t->by >> 1, (t->by + sb_step) >> 1); + } + memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv)); + const int sb128y = t->by >> 5; + for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w, + t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start; + t->bx < ts->tiling.col_end; t->bx += sb_step) + { + if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + return 1; + if (root_bl == BL_128X128) { + t->cur_sb_cdef_idx_ptr = 
t->lf_mask->cdef_idx; + t->cur_sb_cdef_idx_ptr[0] = -1; + t->cur_sb_cdef_idx_ptr[1] = -1; + t->cur_sb_cdef_idx_ptr[2] = -1; + t->cur_sb_cdef_idx_ptr[3] = -1; + } else { + t->cur_sb_cdef_idx_ptr = + &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) + + ((t->by & 16) >> 3)]; + t->cur_sb_cdef_idx_ptr[0] = -1; + } + // Restoration filter + for (int p = 0; p < 3; p++) { + if (!((f->lf.restore_planes >> p) & 1U)) + continue; + + const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p]; + const int y = t->by * 4 >> ss_ver; + const int h = (f->cur.p.h + ss_ver) >> ss_ver; + + const int unit_size = 1 << unit_size_log2; + const unsigned mask = unit_size - 1; + if (y & mask) continue; + const int half_unit = unit_size >> 1; + // Round half up at frame boundaries, if there's more than one + // restoration unit + if (y && y + half_unit > h) continue; + + const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p]; + + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int n_units = imax(1, (w + half_unit) >> unit_size_log2); + + const int d = f->frame_hdr->super_res.width_scale_denominator; + const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3; + const int x0 = ((4 * t->bx * d >> ss_hor) + rnd) >> shift; + const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift; + + for (int x = x0; x < imin(x1, n_units); x++) { + const int px_x = x << (unit_size_log2 + ss_hor); + const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7); + const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6); + Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx]; + + read_restoration_info(t, lr, p, frame_type); + } + } else { + const int x = 4 * t->bx >> ss_hor; + if (x & mask) continue; + const int w = (f->cur.p.w + ss_hor) >> 
ss_hor; + // Round half up at frame boundaries, if there's more than one + // restoration unit + if (x && x + half_unit > w) continue; + const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5); + const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4); + Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx]; + + read_restoration_info(t, lr, p, frame_type); + } + } + if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) + return 1; + if (t->bx & 16 || f->seq_hdr->sb128) { + t->a++; + t->lf_mask++; + } + } + + if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) { + dav1d_refmvs_save_tmvs(&t->rt, + ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, + t->by >> 1, (t->by + sb_step) >> 1); + } + + // backup pre-loopfilter pixels for intra prediction of the next sbrow + if (f->frame_thread.pass != 1) + f->bd_fn.backup_ipred_edge(t); + + // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix" + // up the initial value in neighbour tiles when running the loopfilter + int align_h = (f->bh + 31) & ~31; + memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by], + &t->l.tx_lpf_y[t->by & 16], sb_step); + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + align_h >>= ss_ver; + memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)], + &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver); + + return 0; +} + +int dav1d_decode_frame(Dav1dFrameContext *const f) { + const Dav1dContext *const c = f->c; + int retval = DAV1D_ERR(ENOMEM); + + if (f->n_tc > 1) { + const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh; + if (titsati_sz != f->tile_thread.titsati_sz) { + freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); + f->tile_thread.task_idx_to_sby_and_tile_idx = + malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) * + titsati_sz); + if (!f->tile_thread.task_idx_to_sby_and_tile_idx) { + f->tile_thread.titsati_sz = 0; + goto error; + } + f->tile_thread.titsati_sz = titsati_sz; + } + if 
(f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols || + f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows || + memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows, + sizeof(*f->tile_thread.titsati_index_rows) * + (f->frame_hdr->tiling.rows + 1))) + { + for (int tile_row = 0, task_idx = 0; + tile_row < f->frame_hdr->tiling.rows; tile_row++) + { + for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; + sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) + { + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; + tile_col++, task_idx++) + { + f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby; + f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] = + tile_row * f->frame_hdr->tiling.cols + tile_col; + } + } + } + f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols; + f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows; + memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb, + sizeof(*f->tile_thread.titsati_index_rows) * + (f->frame_hdr->tiling.rows + 1)); + } + } + + const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; + if (n_ts != f->n_ts) { + if (c->n_fc > 1) { + freep(&f->frame_thread.tile_start_off); + f->frame_thread.tile_start_off = + malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts); + if (!f->frame_thread.tile_start_off) { + for (int n = 0; n < f->n_ts; n++) { + Dav1dTileState *const ts = &f->ts[n]; + pthread_cond_destroy(&ts->tile_thread.cond); + pthread_mutex_destroy(&ts->tile_thread.lock); + } + f->n_ts = 0; + goto error; + } + } + Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); + if (!ts_new) goto error; + if (n_ts > f->n_ts) { + if (f->ts) { + memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts); + dav1d_free_aligned(f->ts); + } + f->ts = ts_new; + for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) { + Dav1dTileState *const ts = &f->ts[n]; + if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) 
goto error; + if (pthread_cond_init(&ts->tile_thread.cond, NULL)) { + pthread_mutex_destroy(&ts->tile_thread.lock); + goto error; + } + } + } else { + for (int n = n_ts; n < f->n_ts; n++) { + Dav1dTileState *const ts = &f->ts[n]; + pthread_cond_destroy(&ts->tile_thread.cond); + pthread_mutex_destroy(&ts->tile_thread.lock); + } + memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts); + dav1d_free_aligned(f->ts); + f->n_ts = n_ts; + f->ts = ts_new; + } + } + + const int a_sz = f->sb128w * f->frame_hdr->tiling.rows; + if (a_sz != f->a_sz) { + freep(&f->a); + f->a = malloc(sizeof(*f->a) * a_sz); + if (!f->a) { + f->a_sz = 0; + goto error; + } + f->a_sz = a_sz; + } + + const int num_sb128 = f->sb128w * f->sb128h; + const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; + const int hbd = !!f->seq_hdr->hbd; + if (c->n_fc > 1) { + int tile_idx = 0; + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] * + f->sb_step * 4 * f->sb128w * 128; + int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] - + f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4; + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff * + f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4; + } + } + + const int cf_sz = (num_sb128 * size_mul[0]) << hbd; + if (cf_sz != f->frame_thread.cf_sz) { + dav1d_freep_aligned(&f->frame_thread.cf); + f->frame_thread.cf = + dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 32); + if (!f->frame_thread.cf) { + f->frame_thread.cf_sz = 0; + goto error; + } + memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2); + f->frame_thread.cf_sz = cf_sz; + } + + if (f->frame_hdr->allow_screen_content_tools) { + if (num_sb128 != f->frame_thread.pal_sz) { + dav1d_freep_aligned(&f->frame_thread.pal); + f->frame_thread.pal = + dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) * + 
num_sb128 * 16 * 16, 32); + if (!f->frame_thread.pal) { + f->frame_thread.pal_sz = 0; + goto error; + } + f->frame_thread.pal_sz = num_sb128; + } + + const int pal_idx_sz = num_sb128 * size_mul[1]; + if (pal_idx_sz != f->frame_thread.pal_idx_sz) { + dav1d_freep_aligned(&f->frame_thread.pal_idx); + f->frame_thread.pal_idx = + dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) * + pal_idx_sz * 128 * 128 / 4, 32); + if (!f->frame_thread.pal_idx) { + f->frame_thread.pal_idx_sz = 0; + goto error; + } + f->frame_thread.pal_idx_sz = pal_idx_sz; + } + } else if (f->frame_thread.pal) { + dav1d_freep_aligned(&f->frame_thread.pal); + dav1d_freep_aligned(&f->frame_thread.pal_idx); + f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0; + } + } + + // update allocation of block contexts for above + const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1]; + if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) { + dav1d_free_aligned(f->lf.cdef_line_buf); + size_t alloc_sz = 64; + alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4; + alloc_sz += (uv_stride < 0 ? 
-uv_stride : uv_stride) * 8; + uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32); + if (!ptr) { + f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0; + goto error; + } + + ptr += 32; + if (y_stride < 0) { + f->lf.cdef_line[0][0] = ptr - y_stride * 1; + f->lf.cdef_line[1][0] = ptr - y_stride * 3; + ptr -= y_stride * 4; + } else { + f->lf.cdef_line[0][0] = ptr + y_stride * 0; + f->lf.cdef_line[1][0] = ptr + y_stride * 2; + ptr += y_stride * 4; + } + if (uv_stride < 0) { + f->lf.cdef_line[0][1] = ptr - uv_stride * 1; + f->lf.cdef_line[0][2] = ptr - uv_stride * 3; + f->lf.cdef_line[1][1] = ptr - uv_stride * 5; + f->lf.cdef_line[1][2] = ptr - uv_stride * 7; + } else { + f->lf.cdef_line[0][1] = ptr + uv_stride * 0; + f->lf.cdef_line[0][2] = ptr + uv_stride * 2; + f->lf.cdef_line[1][1] = ptr + uv_stride * 4; + f->lf.cdef_line[1][2] = ptr + uv_stride * 6; + } + + f->lf.cdef_line_sz[0] = (int) y_stride; + f->lf.cdef_line_sz[1] = (int) uv_stride; + } + + const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd; + if (lr_line_sz != f->lf.lr_line_sz) { + dav1d_freep_aligned(&f->lf.lr_lpf_line[0]); + uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32); + if (!lr_ptr) { + f->lf.lr_line_sz = 0; + goto error; + } + + for (int pl = 0; pl <= 2; pl++) { + f->lf.lr_lpf_line[pl] = lr_ptr; + lr_ptr += lr_line_sz * 12; + } + + f->lf.lr_line_sz = lr_line_sz; + } + + // update allocation for loopfilter masks + if (num_sb128 != f->lf.mask_sz) { + freep(&f->lf.mask); + freep(&f->lf.level); + f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128); + // over-allocate by 3 bytes since some of the SIMD implementations + // index this from the level type and can thus over-read by up to 3 + f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3); + if (!f->lf.mask || !f->lf.level) { + f->lf.mask_sz = 0; + goto error; + } + if (c->n_fc > 1) { + freep(&f->frame_thread.b); + freep(&f->frame_thread.cbi); + f->frame_thread.b = 
malloc(sizeof(*f->frame_thread.b) * + num_sb128 * 32 * 32); + f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) * + num_sb128 * 32 * 32); + if (!f->frame_thread.b || !f->frame_thread.cbi) { + f->lf.mask_sz = 0; + goto error; + } + } + f->lf.mask_sz = num_sb128; + } + + f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7; + const int lr_mask_sz = f->sr_sb128w * f->sb128h; + if (lr_mask_sz != f->lf.lr_mask_sz) { + freep(&f->lf.lr_mask); + f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz); + if (!f->lf.lr_mask) { + f->lf.lr_mask_sz = 0; + goto error; + } + f->lf.lr_mask_sz = lr_mask_sz; + } + f->lf.restore_planes = + ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) + + ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) + + ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2); + if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) { + dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness); + f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness; + } + dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 }); + memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128); + + const int ipred_edge_sz = f->sbh * f->sb128w << hbd; + if (ipred_edge_sz != f->ipred_edge_sz) { + dav1d_freep_aligned(&f->ipred_edge[0]); + uint8_t *ptr = f->ipred_edge[0] = + dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 32); + if (!ptr) { + f->ipred_edge_sz = 0; + goto error; + } + f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1; + f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2; + f->ipred_edge_sz = ipred_edge_sz; + } + + const int re_sz = f->sb128h * f->frame_hdr->tiling.cols; + if (re_sz != f->lf.re_sz) { + freep(&f->lf.tx_lpf_right_edge[0]); + f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2); + if (!f->lf.tx_lpf_right_edge[0]) { + f->lf.re_sz = 0; + goto error; + } + f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32; + f->lf.re_sz = re_sz; + } + + // init ref mvs + if 
((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + const int ret = + dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr, + f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc); + if (ret < 0) goto error; + } + retval = DAV1D_ERR(EINVAL); + + // setup dequant tables + init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq); + if (f->frame_hdr->quant.qm) + for (int j = 0; j < N_RECT_TX_SIZES; j++) { + f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j]; + f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j]; + f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j]; + } + for (int i = f->frame_hdr->quant.qm; i < 2; i++) + for (int tx = 0; tx < N_RECT_TX_SIZES; tx++) + for (int pl = 0; pl < 3; pl++) + f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx]; + + // setup jnt_comp weights + if (f->frame_hdr->switchable_comp_refs) { + for (int i = 0; i < 7; i++) { + const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset; + + for (int j = i + 1; j < 7; j++) { + const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset; + + const unsigned d1 = + imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc, + f->cur.frame_hdr->frame_offset)), 31); + const unsigned d0 = + imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc, + f->cur.frame_hdr->frame_offset)), 31); + const int order = d0 <= d1; + + static const uint8_t quant_dist_weight[3][2] = { + { 2, 3 }, { 2, 5 }, { 2, 7 } + }; + static const uint8_t quant_dist_lookup_table[4][2] = { + { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } + }; + + int k; + for (k = 0; k < 3; k++) { + const int c0 = quant_dist_weight[k][order]; + const int c1 = quant_dist_weight[k][!order]; + const int d0_c0 = d0 * c0; + const int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + f->jnt_weights[i][j] = quant_dist_lookup_table[k][order]; + } + } + } + + /* Init loopfilter pointers. 
Increasing NULL pointers is technically UB, + * so just point the chroma pointers in 4:0:0 to the luma plane here to + * avoid having additional in-loop branches in various places. We never + * dereference those pointers so it doesn't really matter what they + * point at, as long as the pointers are valid. */ + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; + f->lf.mask_ptr = f->lf.mask; + f->lf.p[0] = f->cur.data[0]; + f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0]; + f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0]; + f->lf.sr_p[0] = f->sr_cur.p.data[0]; + f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0]; + f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0]; + f->lf.tile_row = 1; + + dav1d_cdf_thread_wait(&f->in_cdf); + if (f->frame_hdr->refresh_context) + dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf); + + // parse individual tiles per tile group + int update_set = 0, tile_row = 0, tile_col = 0; + for (int i = 0; i < f->n_tile_data; i++) { + const uint8_t *data = f->tile[i].data.data; + size_t size = f->tile[i].data.sz; + + for (int j = f->tile[i].start; j <= f->tile[i].end; j++) { + size_t tile_sz; + if (j == f->tile[i].end) { + tile_sz = size; + } else { + if (f->frame_hdr->tiling.n_bytes > size) goto error; + tile_sz = 0; + for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++) + tile_sz |= (unsigned)*data++ << (k * 8); + tile_sz++; + size -= f->frame_hdr->tiling.n_bytes; + if (tile_sz > size) goto error; + } + + setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++, + c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0); + + if (tile_col == f->frame_hdr->tiling.cols) { + tile_col = 0; + tile_row++; + } + if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context) + update_set = 1; + data += tile_sz; + size -= tile_sz; + } + } + + // 2-pass decoding: + // - enabled for frame-threading, so that one frame can do symbol parsing + // as another (or multiple) are doing reconstruction. 
One advantage here + // is that although reconstruction is limited by reference availability, + // symbol parsing is not. Therefore, symbol parsing can effectively use + // row and col tile threading, but reconstruction only col tile threading; + // - pass 0 means no 2-pass; + // - pass 1 means symbol parsing only; + // - pass 2 means reconstruction and loop filtering. + + const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context; + for (f->frame_thread.pass = uses_2pass; + f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++) + { + const enum PlaneType progress_plane_type = + f->frame_thread.pass == 0 ? PLANE_TYPE_ALL : + f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y; + + for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) + reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass); + + if (f->n_tc == 1) { + Dav1dTileContext *const t = f->tc; + + // no tile threading - we explicitly interleave tile/sbrow decoding + // and post-filtering, so that the full process runs in-line, so + // that frame threading is still possible + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + const int sbh_end = + imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh); + for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; + sby < sbh_end; sby++) + { + t->by = sby << (4 + f->seq_hdr->sb128); + const int by_end = (t->by + f->sb_step) >> 1; + if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) { + if (c->n_fc > 1) for (int n = 0; n < 7; n++) + if (dav1d_thread_picture_wait(&f->refp[n], + 4 * (t->by + f->sb_step), + PLANE_TYPE_BLOCK)) + { + goto error; + } + dav1d_refmvs_load_tmvs(&f->rf, tile_row, + 0, f->bw >> 1, t->by >> 1, by_end); + } + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; + + if (dav1d_decode_tile_sbrow(t)) goto error; + } + if (f->frame_thread.pass <= 1 && 
f->frame_hdr->frame_type & 1) { + dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); + } + + // loopfilter + cdef + restoration + if (f->frame_thread.pass != 1) + f->bd_fn.filter_sbrow(f, sby); + dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, + progress_plane_type); + } + } + } else { + // signal available tasks to worker threads + int num_tasks; + + pthread_mutex_lock(&f->tile_thread.lock); + assert(!f->tile_thread.tasks_left); + if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) { + // we can (or in fact, if >, we need to) do full tile decoding. + // loopfilter happens below + num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; + } else { + // we need to interleave sbrow decoding for all tile cols in a + // tile row, since otherwise subsequent threads will be blocked + // waiting for the post-filter to complete + num_tasks = f->sbh * f->frame_hdr->tiling.cols; + } + f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks; + pthread_cond_broadcast(&f->tile_thread.cond); + pthread_mutex_unlock(&f->tile_thread.lock); + + // loopfilter + cdef + restoration + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; + sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) + { + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; + tile_col++) + { + int progress; + Dav1dTileState *const ts = + &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; + + if ((progress = atomic_load(&ts->progress)) <= sby) { + pthread_mutex_lock(&ts->tile_thread.lock); + while ((progress = atomic_load(&ts->progress)) <= sby) + pthread_cond_wait(&ts->tile_thread.cond, + &ts->tile_thread.lock); + pthread_mutex_unlock(&ts->tile_thread.lock); + } + if (progress == TILE_ERROR) { + dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR, + PLANE_TYPE_ALL); + const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); + 
pthread_mutex_lock(&f->tile_thread.lock); + while (f->tile_thread.available != all_mask) + pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); + pthread_mutex_unlock(&f->tile_thread.lock); + goto error; + } + } + + // loopfilter + cdef + restoration + if (f->frame_thread.pass != 1) + f->bd_fn.filter_sbrow(f, sby); + dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, + progress_plane_type); + } + } + + const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); + pthread_mutex_lock(&f->tile_thread.lock); + while (f->tile_thread.available != all_mask) + pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); + pthread_mutex_unlock(&f->tile_thread.lock); + } + + if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) { + // cdf update + if (update_set) + dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, + &f->ts[f->frame_hdr->tiling.update].cdf); + dav1d_cdf_thread_signal(&f->out_cdf); + } + if (f->frame_thread.pass == 1) { + assert(c->n_fc > 1); + for (int tile_idx = 0; + tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols; + tile_idx++) + { + Dav1dTileState *const ts = &f->ts[tile_idx]; + const size_t tile_start_off = + (size_t) f->frame_thread.tile_start_off[tile_idx]; + ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? + &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] : + NULL; + ts->frame_thread.cf = f->frame_thread.cf ? + (uint8_t*)f->frame_thread.cf + + ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : + NULL; + if (f->n_tc > 0) { + const unsigned row_sb_start = + f->frame_hdr->tiling.row_start_sb[ts->tiling.row]; + atomic_init(&ts->progress, row_sb_start); + } + } + } + } + + retval = 0; +error: + dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? 
UINT_MAX : FRAME_ERROR, + PLANE_TYPE_ALL); + for (int i = 0; i < 7; i++) { + if (f->refp[i].p.data[0]) + dav1d_thread_picture_unref(&f->refp[i]); + dav1d_ref_dec(&f->ref_mvs_ref[i]); + } + + dav1d_picture_unref_internal(&f->cur); + dav1d_thread_picture_unref(&f->sr_cur); + dav1d_cdf_thread_unref(&f->in_cdf); + if (f->frame_hdr->refresh_context) { + dav1d_cdf_thread_signal(&f->out_cdf); + dav1d_cdf_thread_unref(&f->out_cdf); + } + dav1d_ref_dec(&f->cur_segmap_ref); + dav1d_ref_dec(&f->prev_segmap_ref); + dav1d_ref_dec(&f->mvs_ref); + dav1d_ref_dec(&f->seq_hdr_ref); + dav1d_ref_dec(&f->frame_hdr_ref); + + for (int i = 0; i < f->n_tile_data; i++) + dav1d_data_unref_internal(&f->tile[i].data); + + return retval; +} + +static int get_upscale_x0(const int in_w, const int out_w, const int step) { + const int err = out_w * step - (in_w << 14); + const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1); + return x0 & 0x3fff; +} + +int dav1d_submit_frame(Dav1dContext *const c) { + Dav1dFrameContext *f; + int res = -1; + + // wait for c->out_delayed[next] and move into c->out if visible + Dav1dThreadPicture *out_delayed; + if (c->n_fc > 1) { + const unsigned next = c->frame_thread.next++; + if (c->frame_thread.next == c->n_fc) + c->frame_thread.next = 0; + + f = &c->fc[next]; + pthread_mutex_lock(&f->frame_thread.td.lock); + while (f->n_tile_data > 0) + pthread_cond_wait(&f->frame_thread.td.cond, + &f->frame_thread.td.lock); + out_delayed = &c->frame_thread.out_delayed[next]; + if (out_delayed->p.data[0]) { + const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], + memory_order_relaxed); + if (out_delayed->visible && progress != FRAME_ERROR) + dav1d_picture_ref(&c->out, &out_delayed->p); + dav1d_thread_picture_unref(out_delayed); + } + } else { + f = c->fc; + } + + f->seq_hdr = c->seq_hdr; + f->seq_hdr_ref = c->seq_hdr_ref; + dav1d_ref_inc(f->seq_hdr_ref); + f->frame_hdr = c->frame_hdr; + f->frame_hdr_ref = c->frame_hdr_ref; + 
c->frame_hdr = NULL; + c->frame_hdr_ref = NULL; + f->dsp = &c->dsp[f->seq_hdr->hbd]; + + const int bpc = 8 + 2 * f->seq_hdr->hbd; + + if (!f->dsp->ipred.intra_pred[DC_PRED]) { + Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd]; + + switch (bpc) { +#define assign_bitdepth_case(bd) \ + dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \ + dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \ + dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \ + dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \ + dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \ + dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \ + dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \ + break +#if CONFIG_8BPC + case 8: + assign_bitdepth_case(8); +#endif +#if CONFIG_16BPC + case 10: + case 12: + assign_bitdepth_case(16); +#endif +#undef assign_bitdepth_case + default: + dav1d_log(c, "Compiled without support for %d-bit decoding\n", + 8 + 2 * f->seq_hdr->hbd); + res = DAV1D_ERR(ENOPROTOOPT); + goto error; + } + } + +#define assign_bitdepth_case(bd) \ + f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \ + f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \ + f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \ + f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \ + f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc + if (!f->seq_hdr->hbd) { +#if CONFIG_8BPC + assign_bitdepth_case(8); +#endif + } else { +#if CONFIG_16BPC + assign_bitdepth_case(16); +#endif + } +#undef assign_bitdepth_case + + int ref_coded_width[7]; + if (f->frame_hdr->frame_type & 1) { + if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) { + const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame]; + if (!c->refs[pri_ref].p.p.data[0]) { + res = DAV1D_ERR(EINVAL); + goto error; + } + } + for (int i = 0; i < 7; i++) { + const int refidx = f->frame_hdr->refidx[i]; + if (!c->refs[refidx].p.p.data[0] || + f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w || + f->frame_hdr->height * 
2 < c->refs[refidx].p.p.p.h || + f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 || + f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 || + f->seq_hdr->layout != c->refs[refidx].p.p.p.layout || + bpc != c->refs[refidx].p.p.p.bpc) + { + for (int j = 0; j < i; j++) + dav1d_thread_picture_unref(&f->refp[j]); + res = DAV1D_ERR(EINVAL); + goto error; + } + dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p); + ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0]; + if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w || + f->frame_hdr->height != c->refs[refidx].p.p.p.h) + { +#define scale_fac(ref_sz, this_sz) \ + ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz)) + f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w, + f->frame_hdr->width[0]); + f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h, + f->frame_hdr->height); + f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4; + f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4; + } else { + f->svc[i][0].scale = 0; + } + f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION && + !f->frame_hdr->force_integer_mv && + !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) && + !f->svc[i][0].scale; + } + } + + // setup entropy + if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { + dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac); + } else { + const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame]; + dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]); + } + if (f->frame_hdr->refresh_context) { + res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? 
&f->frame_thread.td : NULL); + if (res < 0) goto error; + } + + // FIXME qsort so tiles are in order (for frame threading) + if (f->n_tile_data_alloc < c->n_tile_data) { + freep(&f->tile); + assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile)); + f->tile = malloc(c->n_tile_data * sizeof(*f->tile)); + if (!f->tile) { + f->n_tile_data_alloc = f->n_tile_data = 0; + res = DAV1D_ERR(ENOMEM); + goto error; + } + f->n_tile_data_alloc = c->n_tile_data; + } + memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile)); + memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile)); + f->n_tile_data = c->n_tile_data; + c->n_tile_data = 0; + + // allocate frame + res = dav1d_thread_picture_alloc(c, f, bpc); + if (res < 0) goto error; + + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p); + if (res < 0) goto error; + } else { + dav1d_picture_ref(&f->cur, &f->sr_cur.p); + } + + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w); + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor; + const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + f->resize_step[1] = scale_fac(in_cw, out_cw); +#undef scale_fac + f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]); + f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]); + } + + // move f->cur into output queue + if (c->n_fc == 1) { + if (f->frame_hdr->show_frame) + dav1d_picture_ref(&c->out, &f->sr_cur.p); + } else { + dav1d_thread_picture_ref(out_delayed, &f->sr_cur); + } + + f->w4 = (f->frame_hdr->width[0] + 3) >> 2; + f->h4 = (f->frame_hdr->height + 3) >> 2; + f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1; + f->bh = ((f->frame_hdr->height + 7) >> 3) << 1; + f->sb128w = (f->bw + 31) >> 5; + f->sb128h = (f->bh + 31) >> 5; + f->sb_shift = 4 + f->seq_hdr->sb128; + 
f->sb_step = 16 << f->seq_hdr->sb128; + f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift; + f->b4_stride = (f->bw + 31) & ~31; + f->bitdepth_max = (1 << f->cur.p.bpc) - 1; + + // ref_mvs + if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + f->mvs_ref = dav1d_ref_create_using_pool(&c->refmvs_pool, + sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1)); + if (!f->mvs_ref) { + res = DAV1D_ERR(ENOMEM); + goto error; + } + f->mvs = f->mvs_ref->data; + if (!f->frame_hdr->allow_intrabc) { + for (int i = 0; i < 7; i++) + f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset; + } else { + memset(f->refpoc, 0, sizeof(f->refpoc)); + } + if (f->frame_hdr->use_ref_frame_mvs) { + for (int i = 0; i < 7; i++) { + const int refidx = f->frame_hdr->refidx[i]; + if (c->refs[refidx].refmvs != NULL && + ref_coded_width[i] == f->cur.p.w && + f->refp[i].p.p.h == f->cur.p.h) + { + f->ref_mvs_ref[i] = c->refs[refidx].refmvs; + dav1d_ref_inc(f->ref_mvs_ref[i]); + f->ref_mvs[i] = c->refs[refidx].refmvs->data; + } else { + f->ref_mvs[i] = NULL; + f->ref_mvs_ref[i] = NULL; + } + memcpy(f->refrefpoc[i], c->refs[refidx].refpoc, + sizeof(*f->refrefpoc)); + } + } else { + memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref)); + } + } else { + f->mvs_ref = NULL; + memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref)); + } + + // segmap + if (f->frame_hdr->segmentation.enabled) { + // By default, the previous segmentation map is not initialised. + f->prev_segmap_ref = NULL; + f->prev_segmap = NULL; + + // We might need a previous frame's segmentation map. This + // happens if there is either no update or a temporal update. 
+ if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) { + const int pri_ref = f->frame_hdr->primary_ref_frame; + assert(pri_ref != DAV1D_PRIMARY_REF_NONE); + const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1; + const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1; + if (ref_w == f->bw && ref_h == f->bh) { + f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap; + if (f->prev_segmap_ref) { + dav1d_ref_inc(f->prev_segmap_ref); + f->prev_segmap = f->prev_segmap_ref->data; + } + } + } + + if (f->frame_hdr->segmentation.update_map) { + // We're updating an existing map, but need somewhere to + // put the new values. Allocate them here (the data + // actually gets set elsewhere) + f->cur_segmap_ref = dav1d_ref_create_using_pool(&c->segmap_pool, + sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h); + if (!f->cur_segmap_ref) { + dav1d_ref_dec(&f->prev_segmap_ref); + res = DAV1D_ERR(ENOMEM); + goto error; + } + f->cur_segmap = f->cur_segmap_ref->data; + } else if (f->prev_segmap_ref) { + // We're not updating an existing map, and we have a valid + // reference. Use that. + f->cur_segmap_ref = f->prev_segmap_ref; + dav1d_ref_inc(f->cur_segmap_ref); + f->cur_segmap = f->prev_segmap_ref->data; + } else { + // We need to make a new map. Allocate one here and zero it out. + const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h; + f->cur_segmap_ref = dav1d_ref_create_using_pool(&c->segmap_pool, segmap_size); + if (!f->cur_segmap_ref) { + res = DAV1D_ERR(ENOMEM); + goto error; + } + f->cur_segmap = f->cur_segmap_ref->data; + memset(f->cur_segmap, 0, segmap_size); + } + } else { + f->cur_segmap = NULL; + f->cur_segmap_ref = NULL; + f->prev_segmap_ref = NULL; + } + + // update references etc. 
+ const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags; + for (int i = 0; i < 8; i++) { + if (refresh_frame_flags & (1 << i)) { + if (c->refs[i].p.p.data[0]) + dav1d_thread_picture_unref(&c->refs[i].p); + dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur); + + dav1d_cdf_thread_unref(&c->cdf[i]); + if (f->frame_hdr->refresh_context) { + dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf); + } else { + dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf); + } + + dav1d_ref_dec(&c->refs[i].segmap); + c->refs[i].segmap = f->cur_segmap_ref; + if (f->cur_segmap_ref) + dav1d_ref_inc(f->cur_segmap_ref); + dav1d_ref_dec(&c->refs[i].refmvs); + if (!f->frame_hdr->allow_intrabc) { + c->refs[i].refmvs = f->mvs_ref; + if (f->mvs_ref) + dav1d_ref_inc(f->mvs_ref); + } + memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc)); + } + } + + if (c->n_fc == 1) { + if ((res = dav1d_decode_frame(f)) < 0) { + dav1d_picture_unref_internal(&c->out); + for (int i = 0; i < 8; i++) { + if (refresh_frame_flags & (1 << i)) { + if (c->refs[i].p.p.data[0]) + dav1d_thread_picture_unref(&c->refs[i].p); + dav1d_cdf_thread_unref(&c->cdf[i]); + dav1d_ref_dec(&c->refs[i].segmap); + dav1d_ref_dec(&c->refs[i].refmvs); + } + } + return res; + } + } else { + pthread_cond_signal(&f->frame_thread.td.cond); + pthread_mutex_unlock(&f->frame_thread.td.lock); + } + + return 0; +error: + dav1d_cdf_thread_unref(&f->in_cdf); + if (f->frame_hdr->refresh_context) + dav1d_cdf_thread_unref(&f->out_cdf); + for (int i = 0; i < 7; i++) { + if (f->refp[i].p.data[0]) + dav1d_thread_picture_unref(&f->refp[i]); + dav1d_ref_dec(&f->ref_mvs_ref[i]); + } + if (c->n_fc == 1) + dav1d_picture_unref_internal(&c->out); + else + dav1d_thread_picture_unref(out_delayed); + dav1d_picture_unref_internal(&f->cur); + dav1d_thread_picture_unref(&f->sr_cur); + dav1d_ref_dec(&f->mvs_ref); + dav1d_ref_dec(&f->seq_hdr_ref); + dav1d_ref_dec(&f->frame_hdr_ref); + + for (int i = 0; i < f->n_tile_data; i++) + 
dav1d_data_unref_internal(&f->tile[i].data); + f->n_tile_data = 0; + + if (c->n_fc > 1) { + pthread_cond_signal(&f->frame_thread.td.cond); + pthread_mutex_unlock(&f->frame_thread.td.lock); + } + + return res; +} diff --git a/third_party/dav1d/src/decode.h b/third_party/dav1d/src/decode.h new file mode 100644 index 0000000000..1eae5850a5 --- /dev/null +++ b/third_party/dav1d/src/decode.h @@ -0,0 +1,35 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_DECODE_H +#define DAV1D_SRC_DECODE_H + +#include "src/internal.h" + +/* Sets up the next frame from the state held in c. When c->n_fc == 1 the frame is decoded in place through dav1d_decode_frame(); otherwise the frame thread is signalled. Returns 0 on success, or the failing step's error code (a DAV1D_ERR(...) value, or the result of a helper that reported res < 0). */ +int dav1d_submit_frame(Dav1dContext *c); + +#endif /* DAV1D_SRC_DECODE_H */ diff --git a/third_party/dav1d/src/dequant_tables.c b/third_party/dav1d/src/dequant_tables.c new file mode 100644 index 0000000000..5d801119a6 --- /dev/null +++ b/third_party/dav1d/src/dequant_tables.c @@ -0,0 +1,229 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "src/dequant_tables.h" + +const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = { + { + { 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, }, + { 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, }, + { 13, 15, }, { 14, 16, }, { 15, 17, }, { 16, 18, }, + { 17, 19, }, { 18, 20, }, { 19, 21, }, { 19, 22, }, + { 20, 23, }, { 21, 24, }, { 22, 25, }, { 23, 26, }, + { 24, 27, }, { 25, 28, }, { 26, 29, }, { 26, 30, }, + { 27, 31, }, { 28, 32, }, { 29, 33, }, { 30, 34, }, + { 31, 35, }, { 32, 36, }, { 32, 37, }, { 33, 38, }, + { 34, 39, }, { 35, 40, }, { 36, 41, }, { 37, 42, }, + { 38, 43, }, { 38, 44, }, { 39, 45, }, { 40, 46, }, + { 41, 47, }, { 42, 48, }, { 43, 49, }, { 43, 50, }, + { 44, 51, }, { 45, 52, }, { 46, 53, }, { 47, 54, }, + { 48, 55, }, { 48, 56, }, { 49, 57, }, { 50, 58, }, + { 51, 59, }, { 52, 60, }, { 53, 61, }, { 53, 62, }, + { 54, 63, }, { 55, 64, }, { 56, 65, }, { 57, 66, }, + { 57, 67, }, { 58, 68, }, { 59, 69, }, { 60, 70, }, + { 61, 71, }, { 62, 72, }, { 62, 73, }, { 63, 74, }, + { 64, 75, }, { 65, 76, }, { 66, 77, }, { 66, 78, }, + { 67, 79, }, { 68, 80, }, { 69, 81, }, { 70, 82, }, + { 70, 83, }, { 71, 84, }, { 72, 85, }, { 73, 86, }, + { 74, 87, }, { 74, 88, }, { 75, 89, }, { 76, 90, }, + { 77, 91, }, { 78, 92, }, { 78, 93, }, { 79, 94, }, + { 80, 95, }, { 81, 96, }, { 81, 97, }, { 82, 98, }, + { 83, 99, }, { 84, 100, }, { 85, 101, }, { 85, 102, }, + { 87, 104, }, { 88, 106, }, { 90, 108, }, { 92, 110, }, + { 93, 112, }, { 95, 114, }, { 96, 116, }, { 98, 118, }, + { 99, 120, }, { 101, 122, }, { 102, 124, }, { 104, 126, }, + { 105, 128, }, { 107, 130, }, { 108, 132, }, { 110, 134, }, + { 111, 136, }, { 113, 138, }, { 114, 140, }, { 116, 142, }, + { 117, 144, }, { 118, 146, }, { 120, 148, }, { 121, 150, }, + { 123, 152, }, { 125, 155, }, { 127, 158, }, { 129, 161, }, + { 131, 164, }, { 134, 167, }, { 136, 170, }, { 138, 173, }, + { 140, 176, }, { 142, 179, }, { 144, 182, }, { 146, 185, }, + { 148, 188, }, { 150, 191, }, { 
152, 194, }, { 154, 197, }, + { 156, 200, }, { 158, 203, }, { 161, 207, }, { 164, 211, }, + { 166, 215, }, { 169, 219, }, { 172, 223, }, { 174, 227, }, + { 177, 231, }, { 180, 235, }, { 182, 239, }, { 185, 243, }, + { 187, 247, }, { 190, 251, }, { 192, 255, }, { 195, 260, }, + { 199, 265, }, { 202, 270, }, { 205, 275, }, { 208, 280, }, + { 211, 285, }, { 214, 290, }, { 217, 295, }, { 220, 300, }, + { 223, 305, }, { 226, 311, }, { 230, 317, }, { 233, 323, }, + { 237, 329, }, { 240, 335, }, { 243, 341, }, { 247, 347, }, + { 250, 353, }, { 253, 359, }, { 257, 366, }, { 261, 373, }, + { 265, 380, }, { 269, 387, }, { 272, 394, }, { 276, 401, }, + { 280, 408, }, { 284, 416, }, { 288, 424, }, { 292, 432, }, + { 296, 440, }, { 300, 448, }, { 304, 456, }, { 309, 465, }, + { 313, 474, }, { 317, 483, }, { 322, 492, }, { 326, 501, }, + { 330, 510, }, { 335, 520, }, { 340, 530, }, { 344, 540, }, + { 349, 550, }, { 354, 560, }, { 359, 571, }, { 364, 582, }, + { 369, 593, }, { 374, 604, }, { 379, 615, }, { 384, 627, }, + { 389, 639, }, { 395, 651, }, { 400, 663, }, { 406, 676, }, + { 411, 689, }, { 417, 702, }, { 423, 715, }, { 429, 729, }, + { 435, 743, }, { 441, 757, }, { 447, 771, }, { 454, 786, }, + { 461, 801, }, { 467, 816, }, { 475, 832, }, { 482, 848, }, + { 489, 864, }, { 497, 881, }, { 505, 898, }, { 513, 915, }, + { 522, 933, }, { 530, 951, }, { 539, 969, }, { 549, 988, }, + { 559, 1007, }, { 569, 1026, }, { 579, 1046, }, { 590, 1066, }, + { 602, 1087, }, { 614, 1108, }, { 626, 1129, }, { 640, 1151, }, + { 654, 1173, }, { 668, 1196, }, { 684, 1219, }, { 700, 1243, }, + { 717, 1267, }, { 736, 1292, }, { 755, 1317, }, { 775, 1343, }, + { 796, 1369, }, { 819, 1396, }, { 843, 1423, }, { 869, 1451, }, + { 896, 1479, }, { 925, 1508, }, { 955, 1537, }, { 988, 1567, }, + { 1022, 1597, }, { 1058, 1628, }, { 1098, 1660, }, { 1139, 1692, }, + { 1184, 1725, }, { 1232, 1759, }, { 1282, 1793, }, { 1336, 1828, }, + }, { + { 4, 4, }, { 9, 9, }, { 10, 11, }, { 13, 13, }, + { 15, 16, }, 
{ 17, 18, }, { 20, 21, }, { 22, 24, }, + { 25, 27, }, { 28, 30, }, { 31, 33, }, { 34, 37, }, + { 37, 40, }, { 40, 44, }, { 43, 48, }, { 47, 51, }, + { 50, 55, }, { 53, 59, }, { 57, 63, }, { 60, 67, }, + { 64, 71, }, { 68, 75, }, { 71, 79, }, { 75, 83, }, + { 78, 88, }, { 82, 92, }, { 86, 96, }, { 90, 100, }, + { 93, 105, }, { 97, 109, }, { 101, 114, }, { 105, 118, }, + { 109, 122, }, { 113, 127, }, { 116, 131, }, { 120, 136, }, + { 124, 140, }, { 128, 145, }, { 132, 149, }, { 136, 154, }, + { 140, 158, }, { 143, 163, }, { 147, 168, }, { 151, 172, }, + { 155, 177, }, { 159, 181, }, { 163, 186, }, { 166, 190, }, + { 170, 195, }, { 174, 199, }, { 178, 204, }, { 182, 208, }, + { 185, 213, }, { 189, 217, }, { 193, 222, }, { 197, 226, }, + { 200, 231, }, { 204, 235, }, { 208, 240, }, { 212, 244, }, + { 215, 249, }, { 219, 253, }, { 223, 258, }, { 226, 262, }, + { 230, 267, }, { 233, 271, }, { 237, 275, }, { 241, 280, }, + { 244, 284, }, { 248, 289, }, { 251, 293, }, { 255, 297, }, + { 259, 302, }, { 262, 306, }, { 266, 311, }, { 269, 315, }, + { 273, 319, }, { 276, 324, }, { 280, 328, }, { 283, 332, }, + { 287, 337, }, { 290, 341, }, { 293, 345, }, { 297, 349, }, + { 300, 354, }, { 304, 358, }, { 307, 362, }, { 310, 367, }, + { 314, 371, }, { 317, 375, }, { 321, 379, }, { 324, 384, }, + { 327, 388, }, { 331, 392, }, { 334, 396, }, { 337, 401, }, + { 343, 409, }, { 350, 417, }, { 356, 425, }, { 362, 433, }, + { 369, 441, }, { 375, 449, }, { 381, 458, }, { 387, 466, }, + { 394, 474, }, { 400, 482, }, { 406, 490, }, { 412, 498, }, + { 418, 506, }, { 424, 514, }, { 430, 523, }, { 436, 531, }, + { 442, 539, }, { 448, 547, }, { 454, 555, }, { 460, 563, }, + { 466, 571, }, { 472, 579, }, { 478, 588, }, { 484, 596, }, + { 490, 604, }, { 499, 616, }, { 507, 628, }, { 516, 640, }, + { 525, 652, }, { 533, 664, }, { 542, 676, }, { 550, 688, }, + { 559, 700, }, { 567, 713, }, { 576, 725, }, { 584, 737, }, + { 592, 749, }, { 601, 761, }, { 609, 773, }, { 617, 785, }, + { 625, 797, }, 
{ 634, 809, }, { 644, 825, }, { 655, 841, }, + { 666, 857, }, { 676, 873, }, { 687, 889, }, { 698, 905, }, + { 708, 922, }, { 718, 938, }, { 729, 954, }, { 739, 970, }, + { 749, 986, }, { 759, 1002, }, { 770, 1018, }, { 782, 1038, }, + { 795, 1058, }, { 807, 1078, }, { 819, 1098, }, { 831, 1118, }, + { 844, 1138, }, { 856, 1158, }, { 868, 1178, }, { 880, 1198, }, + { 891, 1218, }, { 906, 1242, }, { 920, 1266, }, { 933, 1290, }, + { 947, 1314, }, { 961, 1338, }, { 975, 1362, }, { 988, 1386, }, + { 1001, 1411, }, { 1015, 1435, }, { 1030, 1463, }, { 1045, 1491, }, + { 1061, 1519, }, { 1076, 1547, }, { 1090, 1575, }, { 1105, 1603, }, + { 1120, 1631, }, { 1137, 1663, }, { 1153, 1695, }, { 1170, 1727, }, + { 1186, 1759, }, { 1202, 1791, }, { 1218, 1823, }, { 1236, 1859, }, + { 1253, 1895, }, { 1271, 1931, }, { 1288, 1967, }, { 1306, 2003, }, + { 1323, 2039, }, { 1342, 2079, }, { 1361, 2119, }, { 1379, 2159, }, + { 1398, 2199, }, { 1416, 2239, }, { 1436, 2283, }, { 1456, 2327, }, + { 1476, 2371, }, { 1496, 2415, }, { 1516, 2459, }, { 1537, 2507, }, + { 1559, 2555, }, { 1580, 2603, }, { 1601, 2651, }, { 1624, 2703, }, + { 1647, 2755, }, { 1670, 2807, }, { 1692, 2859, }, { 1717, 2915, }, + { 1741, 2971, }, { 1766, 3027, }, { 1791, 3083, }, { 1817, 3143, }, + { 1844, 3203, }, { 1871, 3263, }, { 1900, 3327, }, { 1929, 3391, }, + { 1958, 3455, }, { 1990, 3523, }, { 2021, 3591, }, { 2054, 3659, }, + { 2088, 3731, }, { 2123, 3803, }, { 2159, 3876, }, { 2197, 3952, }, + { 2236, 4028, }, { 2276, 4104, }, { 2319, 4184, }, { 2363, 4264, }, + { 2410, 4348, }, { 2458, 4432, }, { 2508, 4516, }, { 2561, 4604, }, + { 2616, 4692, }, { 2675, 4784, }, { 2737, 4876, }, { 2802, 4972, }, + { 2871, 5068, }, { 2944, 5168, }, { 3020, 5268, }, { 3102, 5372, }, + { 3188, 5476, }, { 3280, 5584, }, { 3375, 5692, }, { 3478, 5804, }, + { 3586, 5916, }, { 3702, 6032, }, { 3823, 6148, }, { 3953, 6268, }, + { 4089, 6388, }, { 4236, 6512, }, { 4394, 6640, }, { 4559, 6768, }, + { 4737, 6900, }, { 4929, 
7036, }, { 5130, 7172, }, { 5347, 7312, }, + }, { + { 4, 4 }, { 12, 13 }, { 18, 19 }, { 25, 27 }, + { 33, 35 }, { 41, 44 }, { 50, 54 }, { 60, 64 }, + { 70, 75 }, { 80, 87 }, { 91, 99 }, { 103, 112 }, + { 115, 126 }, { 127, 139 }, { 140, 154 }, { 153, 168 }, + { 166, 183 }, { 180, 199 }, { 194, 214 }, { 208, 230 }, + { 222, 247 }, { 237, 263 }, { 251, 280 }, { 266, 297 }, + { 281, 314 }, { 296, 331 }, { 312, 349 }, { 327, 366 }, + { 343, 384 }, { 358, 402 }, { 374, 420 }, { 390, 438 }, + { 405, 456 }, { 421, 475 }, { 437, 493 }, { 453, 511 }, + { 469, 530 }, { 484, 548 }, { 500, 567 }, { 516, 586 }, + { 532, 604 }, { 548, 623 }, { 564, 642 }, { 580, 660 }, + { 596, 679 }, { 611, 698 }, { 627, 716 }, { 643, 735 }, + { 659, 753 }, { 674, 772 }, { 690, 791 }, { 706, 809 }, + { 721, 828 }, { 737, 846 }, { 752, 865 }, { 768, 884 }, + { 783, 902 }, { 798, 920 }, { 814, 939 }, { 829, 957 }, + { 844, 976 }, { 859, 994 }, { 874, 1012 }, { 889, 1030 }, + { 904, 1049 }, { 919, 1067 }, { 934, 1085 }, { 949, 1103 }, + { 964, 1121 }, { 978, 1139 }, { 993, 1157 }, { 1008, 1175 }, + { 1022, 1193 }, { 1037, 1211 }, { 1051, 1229 }, { 1065, 1246 }, + { 1080, 1264 }, { 1094, 1282 }, { 1108, 1299 }, { 1122, 1317 }, + { 1136, 1335 }, { 1151, 1352 }, { 1165, 1370 }, { 1179, 1387 }, + { 1192, 1405 }, { 1206, 1422 }, { 1220, 1440 }, { 1234, 1457 }, + { 1248, 1474 }, { 1261, 1491 }, { 1275, 1509 }, { 1288, 1526 }, + { 1302, 1543 }, { 1315, 1560 }, { 1329, 1577 }, { 1342, 1595 }, + { 1368, 1627 }, { 1393, 1660 }, { 1419, 1693 }, { 1444, 1725 }, + { 1469, 1758 }, { 1494, 1791 }, { 1519, 1824 }, { 1544, 1856 }, + { 1569, 1889 }, { 1594, 1922 }, { 1618, 1954 }, { 1643, 1987 }, + { 1668, 2020 }, { 1692, 2052 }, { 1717, 2085 }, { 1741, 2118 }, + { 1765, 2150 }, { 1789, 2183 }, { 1814, 2216 }, { 1838, 2248 }, + { 1862, 2281 }, { 1885, 2313 }, { 1909, 2346 }, { 1933, 2378 }, + { 1957, 2411 }, { 1992, 2459 }, { 2027, 2508 }, { 2061, 2556 }, + { 2096, 2605 }, { 2130, 2653 }, { 2165, 2701 }, { 2199, 
2750 }, + { 2233, 2798 }, { 2267, 2847 }, { 2300, 2895 }, { 2334, 2943 }, + { 2367, 2992 }, { 2400, 3040 }, { 2434, 3088 }, { 2467, 3137 }, + { 2499, 3185 }, { 2532, 3234 }, { 2575, 3298 }, { 2618, 3362 }, + { 2661, 3426 }, { 2704, 3491 }, { 2746, 3555 }, { 2788, 3619 }, + { 2830, 3684 }, { 2872, 3748 }, { 2913, 3812 }, { 2954, 3876 }, + { 2995, 3941 }, { 3036, 4005 }, { 3076, 4069 }, { 3127, 4149 }, + { 3177, 4230 }, { 3226, 4310 }, { 3275, 4390 }, { 3324, 4470 }, + { 3373, 4550 }, { 3421, 4631 }, { 3469, 4711 }, { 3517, 4791 }, + { 3565, 4871 }, { 3621, 4967 }, { 3677, 5064 }, { 3733, 5160 }, + { 3788, 5256 }, { 3843, 5352 }, { 3897, 5448 }, { 3951, 5544 }, + { 4005, 5641 }, { 4058, 5737 }, { 4119, 5849 }, { 4181, 5961 }, + { 4241, 6073 }, { 4301, 6185 }, { 4361, 6297 }, { 4420, 6410 }, + { 4479, 6522 }, { 4546, 6650 }, { 4612, 6778 }, { 4677, 6906 }, + { 4742, 7034 }, { 4807, 7162 }, { 4871, 7290 }, { 4942, 7435 }, + { 5013, 7579 }, { 5083, 7723 }, { 5153, 7867 }, { 5222, 8011 }, + { 5291, 8155 }, { 5367, 8315 }, { 5442, 8475 }, { 5517, 8635 }, + { 5591, 8795 }, { 5665, 8956 }, { 5745, 9132 }, { 5825, 9308 }, + { 5905, 9484 }, { 5984, 9660 }, { 6063, 9836 }, { 6149, 10028 }, + { 6234, 10220 }, { 6319, 10412 }, { 6404, 10604 }, { 6495, 10812 }, + { 6587, 11020 }, { 6678, 11228 }, { 6769, 11437 }, { 6867, 11661 }, + { 6966, 11885 }, { 7064, 12109 }, { 7163, 12333 }, { 7269, 12573 }, + { 7376, 12813 }, { 7483, 13053 }, { 7599, 13309 }, { 7715, 13565 }, + { 7832, 13821 }, { 7958, 14093 }, { 8085, 14365 }, { 8214, 14637 }, + { 8352, 14925 }, { 8492, 15213 }, { 8635, 15502 }, { 8788, 15806 }, + { 8945, 16110 }, { 9104, 16414 }, { 9275, 16734 }, { 9450, 17054 }, + { 9639, 17390 }, { 9832, 17726 }, { 10031, 18062 }, { 10245, 18414 }, + { 10465, 18766 }, { 10702, 19134 }, { 10946, 19502 }, { 11210, 19886 }, + { 11482, 20270 }, { 11776, 20670 }, { 12081, 21070 }, { 12409, 21486 }, + { 12750, 21902 }, { 13118, 22334 }, { 13501, 22766 }, { 13913, 23214 }, + { 14343, 23662 
}, { 14807, 24126 }, { 15290, 24590 }, { 15812, 25070 }, + { 16356, 25551 }, { 16943, 26047 }, { 17575, 26559 }, { 18237, 27071 }, + { 18949, 27599 }, { 19718, 28143 }, { 20521, 28687 }, { 21387, 29247 }, + } +}; diff --git a/third_party/dav1d/src/dequant_tables.h b/third_party/dav1d/src/dequant_tables.h new file mode 100644 index 0000000000..66bb3b53a4 --- /dev/null +++ b/third_party/dav1d/src/dequant_tables.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_DEQUANT_TABLES_H +#define DAV1D_SRC_DEQUANT_TABLES_H + +#include <stdint.h> + +#include "src/levels.h" + +extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2]; + +#endif /* DAV1D_SRC_DEQUANT_TABLES_H */ diff --git a/third_party/dav1d/src/env.h b/third_party/dav1d/src/env.h new file mode 100644 index 0000000000..7b91c4cab6 --- /dev/null +++ b/third_party/dav1d/src/env.h @@ -0,0 +1,521 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef DAV1D_SRC_ENV_H
+#define DAV1D_SRC_ENV_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "src/levels.h"
+#include "src/refmvs.h"
+#include "src/tables.h"
+
+// Neighbour state read by the context helpers in this header. Most arrays
+// are indexed by 4x4-block position (xb4 into `a`, yb4 into `l`), while
+// partition[] is indexed by 8x8-block position (xb8/yb8).
+// NOTE(review): `a` and `l` look like the above and left edges, since they
+// are gated by have_top and have_left respectively -- confirm with callers.
+typedef struct BlockContext {
+    uint8_t ALIGN(mode[32], 8);
+    uint8_t ALIGN(lcoef[32], 8);
+    uint8_t ALIGN(ccoef[2][32], 8);
+    uint8_t ALIGN(seg_pred[32], 8);
+    uint8_t ALIGN(skip[32], 8);
+    uint8_t ALIGN(skip_mode[32], 8);
+    uint8_t ALIGN(intra[32], 8);
+    uint8_t ALIGN(comp_type[32], 8);
+    int8_t ALIGN(ref[2][32], 8); // -1 means intra
+    uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+    int8_t ALIGN(tx_intra[32], 8);
+    int8_t ALIGN(tx[32], 8);
+    uint8_t ALIGN(tx_lpf_y[32], 8);
+    uint8_t ALIGN(tx_lpf_uv[32], 8);
+    uint8_t ALIGN(partition[16], 8);
+    uint8_t ALIGN(uvmode[32], 8);
+    uint8_t ALIGN(pal_sz[32], 8);
+} BlockContext;
+
+// Context from the neighbours' intra[] entries. With both neighbours
+// available the result is their sum, bumped to 3 when that sum is 2; with
+// only one available it is that entry times 2; with none it is 0.
+static inline int get_intra_ctx(const BlockContext *const a,
+                                const BlockContext *const l,
+                                const int yb4, const int xb4,
+                                const int have_top, const int have_left)
+{
+    if (have_left) {
+        if (have_top) {
+            const int ctx = l->intra[yb4] + a->intra[xb4];
+            return ctx + (ctx == 2);
+        } else
+            return l->intra[yb4] * 2;
+    } else {
+        return have_top ? a->intra[xb4] * 2 : 0;
+    }
+}
+
+// Counts how many of the two tx_intra[] entries reach the matching field of
+// max_tx (l->tx_intra[yb4] against lh, a->tx_intra[xb4] against lw).
+// Returns 0, 1 or 2.
+static inline int get_tx_ctx(const BlockContext *const a,
+                             const BlockContext *const l,
+                             const TxfmInfo *const max_tx,
+                             const int yb4, const int xb4)
+{
+    return (l->tx_intra[yb4] >= max_tx->lh) + (a->tx_intra[xb4] >= max_tx->lw);
+}
+
+// Packs bit (4 - bl) of a->partition[xb8] into bit 0 of the result and the
+// same bit of l->partition[yb8] into bit 1.
+static inline int get_partition_ctx(const BlockContext *const a,
+                                    const BlockContext *const l,
+                                    const enum BlockLevel bl,
+                                    const int yb8, const int xb8)
+{
+    return ((a->partition[xb8] >> (4 - bl)) & 1) +
+          (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
+}
+
+// Sums differences between entries of the partition CDF `in`; the
+// PARTITION_H4 term is left out for BL_128X128.
+// NOTE(review): presumably the combined probability of the partition types
+// that split the left edge -- confirm against the caller.
+static inline unsigned gather_left_partition_prob(const uint16_t *const in,
+                                                  const enum BlockLevel bl)
+{
+    unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
+    // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
+    // PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
+    out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
+    if (bl != BL_128X128)
+        out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
+    return out;
+}
+
+// Counterpart of gather_left_partition_prob() built from the PARTITION_V /
+// PARTITION_T_* / PARTITION_V4 entries of `in`; the last term is left out
+// for BL_128X128.
+// NOTE(review): presumably the combined probability of the partition types
+// that split the top edge -- confirm against the caller.
+static inline unsigned gather_top_partition_prob(const uint16_t *const in,
+                                                 const enum BlockLevel bl)
+{
+    // Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
+    // PARTITION_T_TOP_SPLIT are neighbors.
+    unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
+    // Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
+    // PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
+    // PARTITION_V4 is always zero, and the probability for
+    // PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
+    out += in[PARTITION_T_LEFT_SPLIT - 1];
+    if (bl != BL_128X128)
+        out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
+    return out;
+}
+
+// Derives the returned transform type from ytxtp and the uvt_dim limits:
+// when uvt_dim->max is TX_32X32 only IDTX is kept and everything else becomes
+// DCT_DCT; when uvt_dim->min is TX_16X16 the H_/V_ ADST and FLIPADST types
+// become DCT_DCT; otherwise ytxtp is returned unchanged.
+static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
+                                              const enum TxfmType ytxtp)
+{
+    if (uvt_dim->max == TX_32X32)
+        return ytxtp == IDTX ? IDTX : DCT_DCT;
+    if (uvt_dim->min == TX_16X16 &&
+        ((1 << ytxtp) & ((1 << H_FLIPADST) | (1 << V_FLIPADST) |
+                         (1 << H_ADST) | (1 << V_ADST))))
+    {
+        return DCT_DCT;
+    }
+
+    return ytxtp;
+}
+
+// A neighbour contributes its filter[dir] entry only if one of its two ref
+// slots equals `ref`; otherwise it counts as DAV1D_N_SWITCHABLE_FILTERS.
+// The result is comp * 4 plus: the shared value when both agree, the other
+// neighbour's value when exactly one is DAV1D_N_SWITCHABLE_FILTERS, and
+// DAV1D_N_SWITCHABLE_FILTERS when they disagree.
+static inline int get_filter_ctx(const BlockContext *const a,
+                                 const BlockContext *const l,
+                                 const int comp, const int dir, const int ref,
+                                 const int yb4, const int xb4)
+{
+    const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
+                         a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
+    const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
+                         l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
+
+    if (a_filter == l_filter) {
+        return comp * 4 + a_filter;
+    } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+        return comp * 4 + l_filter;
+    } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+        return comp * 4 + a_filter;
+    } else {
+        return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
+    }
+}
+
+// Context in 0..4 derived from the neighbours' comp_type[] and ref[0][]
+// entries; returns 1 when neither neighbour is available.
+static inline int get_comp_ctx(const BlockContext *const a,
+                               const BlockContext *const l,
+                               const int yb4, const int xb4,
+                               const int have_top, const int have_left)
+{
+    if (have_top) {
+        if (have_left) {
+            if (a->comp_type[xb4]) {
+                if (l->comp_type[yb4]) {
+                    return 4;
+                } else {
+                    // 4U means intra (-1) or bwd (>= 4)
+                    return 2 + ((unsigned)l->ref[0][yb4] >= 4U);
+                }
+            } else if (l->comp_type[yb4]) {
+                // 4U means intra (-1) or bwd (>= 4)
+                return 2 + ((unsigned)a->ref[0][xb4] >= 4U);
+            } else {
+                return (l->ref[0][yb4] >= 4) ^ (a->ref[0][xb4] >= 4);
+            }
+        } else {
+            return a->comp_type[xb4] ? 3 : a->ref[0][xb4] >= 4;
+        }
+    } else if (have_left) {
+        return l->comp_type[yb4] ? 3 : l->ref[0][yb4] >= 4;
+    } else {
+        return 1;
+    }
+}
+
+// Context in 0..4 derived from the neighbours' intra[], comp_type[] and
+// ref[][] entries; returns 2 when neither neighbour is available.
+// has_uni_comp() is true when both ref slots of an edge entry fall on the
+// same side of 4. The macro is not #undef'd, so it stays defined below.
+static inline int get_comp_dir_ctx(const BlockContext *const a,
+                                   const BlockContext *const l,
+                                   const int yb4, const int xb4,
+                                   const int have_top, const int have_left)
+{
+#define has_uni_comp(edge, off) \
+    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
+
+    if (have_top && have_left) {
+        const int a_intra = a->intra[xb4], l_intra = l->intra[yb4];
+
+        if (a_intra && l_intra) return 2;
+        if (a_intra || l_intra) {
+            const BlockContext *const edge = a_intra ? l : a;
+            const int off = a_intra ? yb4 : xb4;
+
+            if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+            return 1 + 2 * has_uni_comp(edge, off);
+        }
+
+        const int a_comp = a->comp_type[xb4] != COMP_INTER_NONE;
+        const int l_comp = l->comp_type[yb4] != COMP_INTER_NONE;
+        const int a_ref0 = a->ref[0][xb4], l_ref0 = l->ref[0][yb4];
+
+        if (!a_comp && !l_comp) {
+            return 1 + 2 * ((a_ref0 >= 4) == (l_ref0 >= 4));
+        } else if (!a_comp || !l_comp) {
+            const BlockContext *const edge = a_comp ? a : l;
+            const int off = a_comp ? xb4 : yb4;
+
+            if (!has_uni_comp(edge, off)) return 1;
+            return 3 + ((a_ref0 >= 4) == (l_ref0 >= 4));
+        } else {
+            const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
+
+            if (!a_uni && !l_uni) return 0;
+            if (!a_uni || !l_uni) return 2;
+            return 3 + ((a_ref0 == 4) == (l_ref0 == 4));
+        }
+    } else if (have_top || have_left) {
+        const BlockContext *const edge = have_left ? l : a;
+        const int off = have_left ? yb4 : xb4;
+
+        if (edge->intra[off]) return 2;
+        if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+        return 4 * has_uni_comp(edge, off);
+    } else {
+        return 2;
+    }
+}
+
+// Returns poc0 - poc1 reduced to its low order_hint_n_bits bits and read as
+// a two's-complement signed value of that width (the top bit counts as
+// negative). Returns 0 when order_hint_n_bits is 0.
+static inline int get_poc_diff(const int order_hint_n_bits,
+                               const int poc0, const int poc1)
+{
+    if (!order_hint_n_bits) return 0;
+    const int mask = 1 << (order_hint_n_bits - 1);
+    const int diff = poc0 - poc1;
+    return (diff & (mask - 1)) - (diff & mask);
+}
+
+// Returns 3 when |get_poc_diff(ref0poc, poc)| equals
+// |get_poc_diff(poc, ref1poc)| (0 otherwise), plus one for each neighbour
+// whose comp_type is >= COMP_INTER_AVG or whose ref[0] is 6.
+static inline int get_jnt_comp_ctx(const int order_hint_n_bits,
+                                   const unsigned poc, const unsigned ref0poc,
+                                   const unsigned ref1poc,
+                                   const BlockContext *const a,
+                                   const BlockContext *const l,
+                                   const int yb4, const int xb4)
+{
+    const unsigned d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
+    const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
+    const int offset = d0 == d1;
+    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
+                      a->ref[0][xb4] == 6;
+    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_AVG ||
+                      l->ref[0][yb4] == 6;
+
+    return 3 * offset + a_ctx + l_ctx;
+}
+
+// Each neighbour scores 1 if its comp_type is >= COMP_INTER_SEG, else 3 if
+// its ref[0] is 6, else 0; the two scores are added and passed through
+// imin(sum, 5).
+static inline int get_mask_comp_ctx(const BlockContext *const a,
+                                    const BlockContext *const l,
+                                    const int yb4, const int xb4)
+{
+    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_SEG ? 1 :
+                      a->ref[0][xb4] == 6 ? 3 : 0;
+    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_SEG ? 1 :
+                      l->ref[0][yb4] == 6 ? 3 : 0;
+
+    return imin(a_ctx + l_ctx, 5);
+}
+
+// Aliases: each av1_get_ref_N_ctx / av1_get_uni_p*_ctx name expands to the
+// helper function named on the right.
+#define av1_get_ref_2_ctx av1_get_bwd_ref_ctx
+#define av1_get_ref_3_ctx av1_get_fwd_ref_ctx
+#define av1_get_ref_4_ctx av1_get_fwd_ref_1_ctx
+#define av1_get_ref_5_ctx av1_get_fwd_ref_2_ctx
+#define av1_get_ref_6_ctx av1_get_bwd_ref_1_ctx
+#define av1_get_uni_p_ctx av1_get_ref_ctx
+#define av1_get_uni_p2_ctx av1_get_fwd_ref_2_ctx
+
+// Tallies the refs of the available, non-intra neighbours (ref[1] only when
+// comp_type is set) into a "< 4" bucket and a ">= 4" bucket. Returns 1 on a
+// tie, 0 when the ">= 4" bucket is larger and 2 when the "< 4" bucket is
+// larger.
+static inline int av1_get_ref_ctx(const BlockContext *const a,
+                                  const BlockContext *const l,
+                                  const int yb4, const int xb4,
+                                  int have_top, int have_left)
+{
+    int cnt[2] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        cnt[a->ref[0][xb4] >= 4]++;
+        if (a->comp_type[xb4]) cnt[a->ref[1][xb4] >= 4]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        cnt[l->ref[0][yb4] >= 4]++;
+        if (l->comp_type[yb4]) cnt[l->ref[1][yb4] >= 4]++;
+    }
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_ctx(const BlockContext *const a,
+                                      const BlockContext *const l,
+                                      const int yb4, const int xb4,
+                                      const int have_top, const int have_left)
+{
+    int cnt[4] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] < 4) cnt[a->ref[0][xb4]]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] < 4) cnt[a->ref[1][xb4]]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] < 4) cnt[l->ref[0][yb4]]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] < 4) cnt[l->ref[1][yb4]]++;
+    }
+
+    cnt[0] += cnt[1];
+    cnt[2] += cnt[3];
+
+    return cnt[0] == cnt[2] ? 1 : cnt[0] < cnt[2] ?
0 : 2; +} + +static inline int av1_get_fwd_ref_1_ctx(const BlockContext *const a, + const BlockContext *const l, + const int yb4, const int xb4, + const int have_top, const int have_left) +{ + int cnt[2] = { 0 }; + + if (have_top && !a->intra[xb4]) { + if (a->ref[0][xb4] < 2) cnt[a->ref[0][xb4]]++; + if (a->comp_type[xb4] && a->ref[1][xb4] < 2) cnt[a->ref[1][xb4]]++; + } + + if (have_left && !l->intra[yb4]) { + if (l->ref[0][yb4] < 2) cnt[l->ref[0][yb4]]++; + if (l->comp_type[yb4] && l->ref[1][yb4] < 2) cnt[l->ref[1][yb4]]++; + } + + return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2; +} + +static inline int av1_get_fwd_ref_2_ctx(const BlockContext *const a, + const BlockContext *const l, + const int yb4, const int xb4, + const int have_top, const int have_left) +{ + int cnt[2] = { 0 }; + + if (have_top && !a->intra[xb4]) { + if ((a->ref[0][xb4] ^ 2U) < 2) cnt[a->ref[0][xb4] - 2]++; + if (a->comp_type[xb4] && (a->ref[1][xb4] ^ 2U) < 2) cnt[a->ref[1][xb4] - 2]++; + } + + if (have_left && !l->intra[yb4]) { + if ((l->ref[0][yb4] ^ 2U) < 2) cnt[l->ref[0][yb4] - 2]++; + if (l->comp_type[yb4] && (l->ref[1][yb4] ^ 2U) < 2) cnt[l->ref[1][yb4] - 2]++; + } + + return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2; +} + +static inline int av1_get_bwd_ref_ctx(const BlockContext *const a, + const BlockContext *const l, + const int yb4, const int xb4, + const int have_top, const int have_left) +{ + int cnt[3] = { 0 }; + + if (have_top && !a->intra[xb4]) { + if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++; + if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++; + } + + if (have_left && !l->intra[yb4]) { + if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++; + if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++; + } + + cnt[1] += cnt[0]; + + return cnt[2] == cnt[1] ? 1 : cnt[1] < cnt[2] ? 
0 : 2; +} + +static inline int av1_get_bwd_ref_1_ctx(const BlockContext *const a, + const BlockContext *const l, + const int yb4, const int xb4, + const int have_top, const int have_left) +{ + int cnt[3] = { 0 }; + + if (have_top && !a->intra[xb4]) { + if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++; + if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++; + } + + if (have_left && !l->intra[yb4]) { + if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++; + if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++; + } + + return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2; +} + +static inline int av1_get_uni_p1_ctx(const BlockContext *const a, + const BlockContext *const l, + const int yb4, const int xb4, + const int have_top, const int have_left) +{ + int cnt[3] = { 0 }; + + if (have_top && !a->intra[xb4]) { + if (a->ref[0][xb4] - 1U < 3) cnt[a->ref[0][xb4] - 1]++; + if (a->comp_type[xb4] && a->ref[1][xb4] - 1U < 3) cnt[a->ref[1][xb4] - 1]++; + } + + if (have_left && !l->intra[yb4]) { + if (l->ref[0][yb4] - 1U < 3) cnt[l->ref[0][yb4] - 1]++; + if (l->comp_type[yb4] && l->ref[1][yb4] - 1U < 3) cnt[l->ref[1][yb4] - 1]++; + } + + cnt[1] += cnt[2]; + + return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2; +} + +static inline int get_drl_context(const refmvs_candidate *const ref_mv_stack, + const int ref_idx) +{ + if (ref_mv_stack[ref_idx].weight >= 640) + return ref_mv_stack[ref_idx + 1].weight < 640; + + return ref_mv_stack[ref_idx + 1].weight < 640 ? 
2 : 0; +} + +static inline unsigned get_cur_frame_segid(const int by, const int bx, + const int have_top, + const int have_left, + int *const seg_ctx, + const uint8_t *cur_seg_map, + const ptrdiff_t stride) +{ + cur_seg_map += bx + by * stride; + if (have_left && have_top) { + const int l = cur_seg_map[-1]; + const int a = cur_seg_map[-stride]; + const int al = cur_seg_map[-(stride + 1)]; + + if (l == a && al == l) *seg_ctx = 2; + else if (l == a || al == l || a == al) *seg_ctx = 1; + else *seg_ctx = 0; + return a == al ? a : l; + } else { + *seg_ctx = 0; + return have_left ? cur_seg_map[-1] : have_top ? cur_seg_map[-stride] : 0; + } +} + +static inline void fix_int_mv_precision(mv *const mv) { + mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U; + mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U; +} + +static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr, + mv *const mv) +{ + if (hdr->force_integer_mv) { + fix_int_mv_precision(mv); + } else if (!hdr->hp) { + mv->x = (mv->x - (mv->x >> 15)) & ~1U; + mv->y = (mv->y - (mv->y >> 15)) & ~1U; + } +} + +static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv, + const int bx4, const int by4, + const int bw4, const int bh4, + const Dav1dFrameHeader *const hdr) +{ + switch (gmv->type) { + case DAV1D_WM_TYPE_ROT_ZOOM: + assert(gmv->matrix[5] == gmv->matrix[2]); + assert(gmv->matrix[4] == -gmv->matrix[3]); + // fall-through + default: + case DAV1D_WM_TYPE_AFFINE: { + const int x = bx4 * 4 + bw4 * 2 - 1; + const int y = by4 * 4 + bh4 * 2 - 1; + const int xc = (gmv->matrix[2] - (1 << 16)) * x + + gmv->matrix[3] * y + gmv->matrix[0]; + const int yc = (gmv->matrix[5] - (1 << 16)) * y + + gmv->matrix[4] * x + gmv->matrix[1]; + const int shift = 16 - (3 - !hdr->hp); + const int round = (1 << shift) >> 1; + mv res = (mv) { + .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc), + .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc), + }; + if (hdr->force_integer_mv) + fix_int_mv_precision(&res); + 
return res; + } + case DAV1D_WM_TYPE_TRANSLATION: { + mv res = (mv) { + .y = gmv->matrix[0] >> 13, + .x = gmv->matrix[1] >> 13, + }; + if (hdr->force_integer_mv) + fix_int_mv_precision(&res); + return res; + } + case DAV1D_WM_TYPE_IDENTITY: + return (mv) { .x = 0, .y = 0 }; + } +} + +#endif /* DAV1D_SRC_ENV_H */ diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm new file mode 100644 index 0000000000..60384bf720 --- /dev/null +++ b/third_party/dav1d/src/ext/x86/x86inc.asm @@ -0,0 +1,1827 @@ +;***************************************************************************** +;* x86inc.asm: x86 abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2020 x264 project +;* +;* Authors: Loren Merritt +;* Henrik Gramner +;* Anton Mitrofanov +;* Fiona Glaser +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x86inc.asm assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). 
+; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used. + +%ifndef private_prefix + %error private_prefix not defined +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%define FORMAT_MACHO 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,macho + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define FORMAT_MACHO 1 +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 + %else + SECTION .rodata align=%1 + %endif +%endmacro + +%if ARCH_X86_64 + %define PIC 1 ; always use PIC on x86-64 + default rel +%elifidn __OUTPUT_FORMAT__,win32 + %define PIC 0 ; PIC isn't used on 32-bit Windows +%elifndef PIC + %define PIC 0 +%endif + +%define HAVE_PRIVATE_EXTERN 1 +%ifdef __NASM_VER__ + %use smartalign + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + %define HAVE_PRIVATE_EXTERN 0 + %endif +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most use cases. + +; PROLOGUE: +; %1 = number of arguments. 
loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need a more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. 
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro LEA 2 +%if ARCH_X86_64 + lea %1, [%2] +%elif PIC + call $+5 ; special-cased to not affect the RSB on most CPU:s + pop %1 + add %1, (%2)-$+1 +%else + mov %1, %2 +%endif +%endmacro + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign 
stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%if ARCH_X86_64 == 0 + %define movsxd movifnidn +%endif + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 
16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) + +%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 0-1 0 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. 
+ %assign regs_used (regs_used + 1) + %if ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i 8 + %rep %%xmm_regs_on_stack + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 0 + %assign %%pad_size 0 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack + %assign %%i %%i-1 + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL + %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL + 
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + %assign xmm_regs_used %3 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + %endmacro + %macro WIN64_RESTORE_XMM 0 + %assign xmm_regs_used 0 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue || cpuflag(ssse3) + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 
+ %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %ifndef cglobaled_%2 + %if %1 + %xdefine %2 mangle(private_prefix %+ _ %+ %2) + %else + %xdefine %2 mangle(public_prefix %+ _ %+ %2) + %endif + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + %if %1 + global %2:function hidden + %else + global %2:function + %endif + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 + global %2:private_extern + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function hidden + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global current_function %+ %1:private_extern + %else + global current_function %+ %1 + %endif + %1: +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix 
+%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global %1:private_extern + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. +%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11) | cpuflags_sse4 +%assign cpuflags_aesni (1<<12) | cpuflags_sse42 +%assign cpuflags_gfni (1<<13) | cpuflags_sse42 +%assign cpuflags_avx (1<<14) | cpuflags_sse42 +%assign cpuflags_xop (1<<15) | cpuflags_avx +%assign cpuflags_fma4 (1<<16) | cpuflags_avx +%assign cpuflags_fma3 (1<<17) | 
cpuflags_avx +%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1 +%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL +%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ + +%assign cpuflags_cache32 (1<<23) +%assign cpuflags_cache64 (1<<24) +%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<26) + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
+%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + %ifdef __NASM_VER__ + ALIGNMODE p6 + %else + CPU amdnop + %endif + %else + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif + %endif +%endmacro + +; Merge mmx, sse*, and avx* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) 
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + INIT_CPUFLAGS %1 + DEFINE_MMREGS mm +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif + %xdefine bcstd 1to4 + %xdefine bcstq 1to2 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION + %xdefine bcstd 1to8 + %xdefine bcstq 1to4 +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define 
RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION + %xdefine bcstd 1to16 + %xdefine bcstq 1to8 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 +%endmacro + +%assign i 0 +%rep 32 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... 
+ SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + %xdefine %%tmp m %+ %%i + CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 0-1 ; name to load from + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %xdefine %%tmp %%f %+ 0 + %ifnum %%tmp + RESET_MM_PERMUTATION + AVX512_MM_PERMUTATION + %assign %%i 0 + %rep num_mmregs + %xdefine %%tmp %%f %+ %%i + CAT_XDEFINE %%m, %%i, m %+ %%tmp + %assign %%i %%i+1 + %endrep + %rep num_mmregs + %assign %%i %%i-1 + CAT_XDEFINE m, %%i, %%m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + 
%else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 32 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) + %error use of ``%1'' avx2 instruction in cpuname function: current_function + %elif __sizeofreg == 16 && notcpuflag(sse) + %error use of ``%1'' sse instruction in cpuname function: current_function + %elif __sizeofreg == 32 && notcpuflag(avx) + %error use of ``%1'' avx instruction in 
cpuname function: current_function + %elif __sizeofreg == 64 && notcpuflag(avx512) + %error use of ``%1'' avx512 instruction in cpuname function: current_function + %elifidn %1, pextrw ; special case because the base instruction is mmx2, + %ifnid %6 ; but sse4 is required for memory operands + %if notcpuflag(sse4) + %error use of ``%1'' sse4 instruction in cpuname function: current_function + %endif + %endif + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + %if avx_enabled && %5 + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnum regnumof%7 + %ifnum regnumof%8 + %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 + ; Most VEX-encoded instructions require an additional byte to encode when + ; src2 is a high register (e.g. m8..15). If the instruction is commutative + ; we can swap src1 and src2 when doing so reduces the instruction length. 
+ %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7, %8 + %endif + %elif %0 == 7 + %if avx_enabled && %5 + %xdefine __src1 %6 + %xdefine __src2 %7 + %ifnum regnumof%6 + %ifnum regnumof%7 + %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 + %xdefine __src1 %7 + %xdefine __src2 %6 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7 + %endif + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX/EVEX and legacy encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR 
blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1 ; name, min ISA, float, emulation (0 = 3-operand, 1 = 4-operand), commutative
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR
extractps, sse4, 1 +AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse, 1 +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2, 1 +AVX_INSTR movaps, sse, 1 +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3, 1 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2, 1 +AVX_INSTR movmskps, sse, 1 +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2, 1 +AVX_INSTR movntps, sse, 1 +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3, 1 +AVX_INSTR movsldup, sse3, 1 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2, 1 +AVX_INSTR movups, sse, 1 +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 
+AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR 
pmovzxbq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4, 1 +AVX_INSTR roundps, sse4, 1 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse, 1 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1 +AVX_INSTR sqrtps, sse, 1 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse, 1 +AVX_INSTR subpd, 
sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2, 1 +AVX_INSTR ucomiss, sse, 1 +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 + +;%1 == instruction +;%2 == minimal instruction set +%macro GPR_INSTR 2 + %macro %1 2-5 fnord, %1, %2 + %ifdef cpuname + %if notcpuflag(%5) + %error use of ``%4'' %5 instruction in cpuname function: current_function + %endif + %endif + %ifidn %3, fnord + %4 %1, %2 + %else + %4 %1, %2, %3 + %endif + %endmacro +%endmacro + +GPR_INSTR andn, bmi1 +GPR_INSTR bextr, bmi1 +GPR_INSTR blsi, bmi1 +GPR_INSTR blsmsk, bmi1 +GPR_INSTR blsr, bmi1 +GPR_INSTR bzhi, bmi2 +GPR_INSTR mulx, bmi2 +GPR_INSTR pdep, bmi2 +GPR_INSTR pext, bmi2 +GPR_INSTR popcnt, sse42 +GPR_INSTR rorx, bmi2 +GPR_INSTR sarx, bmi2 +GPR_INSTR shlx, bmi2 +GPR_INSTR shrx, bmi2 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, 
src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. + %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; Macros for converting VEX instructions to equivalent EVEX ones. 
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%3 + %if regnumof%3 >= 16 || sizeof%3 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h new file mode 100644 index 0000000000..6b96a06fc3 --- /dev/null +++ b/third_party/dav1d/src/fg_apply.h @@ -0,0 +1,41 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_FG_APPLY_H +#define DAV1D_SRC_FG_APPLY_H + +#include "dav1d/picture.h" + +#include "common/bitdepth.h" + +#include "src/film_grain.h" + +bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + Dav1dPicture *const in); + +#endif /* DAV1D_SRC_FG_APPLY_H */ diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c new file mode 100644 index 0000000000..4cde92c49b --- /dev/null +++ b/third_party/dav1d/src/fg_apply_tmpl.c @@ -0,0 +1,209 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "dav1d/picture.h"
+
+#include "common.h"
+#include "common/intops.h"
+#include "common/bitdepth.h"
+
+#include "fg_apply.h"
+
+/*
+ * Expand a list of (x, y) control points into a per-sample-value scaling
+ * lookup table.
+ *
+ * bitdepth: bit depth of the picture; the table covers 1 << bitdepth entries.
+ * points:   control points, points[i][0] = x and points[i][1] = y. The x
+ *           values must be strictly increasing (each segment width is used
+ *           as a divisor).
+ * num:      number of control points; 0 yields an all-zero table.
+ * scaling:  output table.
+ *
+ * Entries below the first point take its y value and entries from the last
+ * point onwards take the last y value. In builds with BITDEPTH != 8 the x
+ * coordinates are scaled by 1 << (bitdepth - 8) and the entries between the
+ * scaled positions are filled in by a second interpolation pass.
+ */
+static void generate_scaling(const int bitdepth,
+                             const uint8_t points[][2], const int num,
+                             uint8_t scaling[SCALING_SIZE])
+{
+#if BITDEPTH == 8
+    const int shift_x = 0;
+#else
+    const int shift_x = bitdepth - 8;
+#endif
+    const int scaling_size = 1 << bitdepth;
+
+    // Without control points there is nothing to index below; the
+    // resulting scaling function is identically zero.
+    if (num <= 0) {
+        memset(scaling, 0, scaling_size);
+        return;
+    }
+
+    // Fill up the preceding entries with the initial value
+    for (int i = 0; i < points[0][0] << shift_x; i++)
+        scaling[i] = points[0][1];
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0];
+        const int by = points[i][1];
+        const int ex = points[i+1][0];
+        const int ey = points[i+1][1];
+        const int dx = ex - bx;
+        const int dy = ey - by;
+        assert(dx > 0); // dx is used as a divisor below
+        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0; x < dx; x++) {
+            const int v = by + ((x * delta + 0x8000) >> 16);
+            scaling[(bx + x) << shift_x] = v;
+        }
+    }
+
+    // Fill up the remaining entries with the final value
+    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
+        scaling[i] = points[num - 1][1];
+
+#if BITDEPTH != 8
+    // Only every (1 << shift_x)-th entry was written above; interpolate
+    // the entries in between.
+    const int pad = 1 << shift_x, rnd = pad >> 1;
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0] << shift_x;
+        const int ex = points[i+1][0] << shift_x;
+        const int dx = ex - bx;
+        for (int x = 0; x < dx; x += pad) {
+            const int range = scaling[bx + x + pad] - scaling[bx + x];
+            for (int n = 1; n < pad; n++) {
+                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
+            }
+        }
+    }
+#endif
+}
+
+#ifndef UNIT_TEST
+/*
+ * Apply the film grain described by out->frame_hdr->film_grain.data to the
+ * picture `in`, writing the result to `out`.
+ *
+ * Planes that receive no grain are copied from `in` to `out` unchanged. The
+ * remaining planes are processed in rows of BLOCK_SIZE luma lines through the
+ * kernels in `dsp`. `in` and `out` must use identical strides (asserted).
+ * Note that the luma plane of `in` may be written to: when the width is odd
+ * and chroma is horizontally subsampled, one padding pixel per line is
+ * replicated past the right edge.
+ */
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+                              Dav1dPicture *const out,
+                              Dav1dPicture *const in)
+{
+    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+
+    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    uint8_t scaling[3][SCALING_SIZE];
+#if BITDEPTH != 8
+    const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+    // Generate grain LUTs as needed
+    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
+    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
+                                                 data, 0 HIGHBD_TAIL_SUFFIX);
+    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
+                                                 data, 1 HIGHBD_TAIL_SUFFIX);
+
+    // Generate scaling LUTs as needed. The luma LUT is also consumed by the
+    // chroma kernels when chroma_scaling_from_luma is set, so it has to be
+    // initialized in that case even if there are no luma points.
+    if (data->num_y_points || data->chroma_scaling_from_luma)
+        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
+    if (data->num_uv_points[0])
+        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
+    if (data->num_uv_points[1])
+        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
+
+    // Copy over the non-modified planes
+    // TODO: eliminate in favor of per-plane refs
+    assert(out->stride[0] == in->stride[0]);
+    if (!data->num_y_points) {
+        const ptrdiff_t stride = out->stride[0];
+        const ptrdiff_t sz = out->p.h * stride;
+        // A negative stride means data[] points at the line with the
+        // highest address; start the copy at the lowest-addressed line.
+        if (sz < 0)
+            memcpy((uint8_t*) out->data[0] + sz - stride,
+                   (uint8_t*) in->data[0] + sz - stride, -sz);
+        else
+            memcpy(out->data[0], in->data[0], sz);
+    }
+
+    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
+        assert(out->stride[1] == in->stride[1]);
+        const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const ptrdiff_t stride = out->stride[1];
+        // Round the chroma height up before multiplying so that the last
+        // chroma line of an odd-height 4:2:0 picture is copied in full, and
+        // so that no negative value is ever right-shifted.
+        const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride;
+        if (sz < 0) {
+            if (!data->num_uv_points[0])
+                memcpy((uint8_t*) out->data[1] + sz - stride,
+                       (uint8_t*) in->data[1] + sz - stride, -sz);
+            if (!data->num_uv_points[1])
+                memcpy((uint8_t*) out->data[2] + sz - stride,
+                       (uint8_t*) in->data[2] + sz - stride, -sz);
+        } else {
+            if (!data->num_uv_points[0])
+                memcpy(out->data[1], in->data[1], sz);
+            if (!data->num_uv_points[1])
+                memcpy(out->data[2], in->data[2], sz);
+        }
+    }
+
+    // Synthesize grain for the affected planes
+    const int rows = (out->p.h + 31) >> 5;
+    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cpw = (out->p.w + ss_x) >> ss_x;
+    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
+    for (int row = 0; row < rows; row++) {
+        pixel *const luma_src =
+            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+
+        if (data->num_y_points) {
+            const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
+            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+                             luma_src, out->stride[0], data,
+                             out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+        }
+
+        if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+            !data->chroma_scaling_from_luma)
+        {
+            continue;
+        }
+
+        // Height of this row in chroma lines
+        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+
+        // extend padding pixels
+        if (out->p.w & ss_x) {
+            pixel *ptr = luma_src;
+            for (int y = 0; y < bh; y++) {
+                ptr[out->p.w] = ptr[out->p.w - 1];
+                ptr += PXSTRIDE(in->stride[0]) << ss_y;
+            }
+        }
+
+        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+        if (data->chroma_scaling_from_luma) {
+            for (int pl = 0; pl < 2; pl++)
+                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
+                                                    in->stride[1], data, cpw,
+                                                    scaling[0], grain_lut[1 + pl],
+                                                    bh, row, luma_src, in->stride[0],
+                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
+        } else {
+            for (int pl = 0; pl < 2; pl++)
+                if (data->num_uv_points[pl])
+                    dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+                                                        ((const pixel *) in->data[1 + pl]) + uv_off,
+                                                        in->stride[1], data, cpw,
+                                                        scaling[1 + pl], grain_lut[1 + pl],
+                                                        bh, row, luma_src, in->stride[0],
+                                                        pl, is_id HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+#endif
diff --git a/third_party/dav1d/src/film_grain.h b/third_party/dav1d/src/film_grain.h
new file
mode 100644 index 0000000000..5bd42877c3 --- /dev/null +++ b/third_party/dav1d/src/film_grain.h @@ -0,0 +1,85 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_FILM_GRAIN_H +#define DAV1D_SRC_FILM_GRAIN_H + +#include "common/bitdepth.h" + +#include "src/levels.h" + +#define GRAIN_WIDTH 82 /* row length of every grain LUT (second array dimension in the prototypes below) */ +#define GRAIN_HEIGHT 73 /* number of rows filled by the luma grain generator */ +#define BLOCK_SIZE 32 /* luma block edge length stepped over by the 32x32xn appliers */ +#if !defined(BITDEPTH) || BITDEPTH == 8 +#define SCALING_SIZE 256 +typedef int8_t entry; /* grain LUT element type when BITDEPTH is undefined or 8 */ +#else +#define SCALING_SIZE 4096 +typedef int16_t entry; /* grain LUT element type for every other BITDEPTH */ +#endif + +#define decl_generate_grain_y_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_y_fn(*generate_grain_y_fn); + +#define decl_generate_grain_uv_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); + +#define decl_fgy_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, \ + size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], \ + int bh, int row_num HIGHBD_DECL_SUFFIX) +typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); + +#define decl_fguv_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, int pw, \ + const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ + const pixel *luma_row, ptrdiff_t luma_stride, \ + int uv_pl, int is_id HIGHBD_DECL_SUFFIX) +typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); + +typedef struct Dav1dFilmGrainDSPContext { /* function-pointer table for film grain generation and application */ + generate_grain_y_fn generate_grain_y; + generate_grain_uv_fn generate_grain_uv[3]; /* indexed by pixel layout - 1 (I420, I422, I444) */ + + fgy_32x32xn_fn fgy_32x32xn; + fguv_32x32xn_fn fguv_32x32xn[3]; /* indexed by pixel layout - 1 (I420, I422, I444) */ +} Dav1dFilmGrainDSPContext; + +bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); + +#endif /* DAV1D_SRC_FILM_GRAIN_H */ diff --git
a/third_party/dav1d/src/film_grain_tmpl.c b/third_party/dav1d/src/film_grain_tmpl.c new file mode 100644 index 0000000000..90a03a4ad2 --- /dev/null +++ b/third_party/dav1d/src/film_grain_tmpl.c @@ -0,0 +1,437 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "common/attributes.h" +#include "common/intops.h" + +#include "film_grain.h" +#include "tables.h" + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +static inline int get_random_number(const int bits, unsigned *const state) { /* LFSR step: the XOR of state bits 0, 1, 3 and 12 is shifted in at bit 15; returns `bits` bits taken just below bit 16 of the updated state */ + const int r = *state; + unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static inline int round2(const int x, const uint64_t shift) { /* rounding right shift: adds half of (1 << shift) before shifting; nothing is added when shift is 0 */ + return (x + ((1 << shift) >> 1)) >> shift; +} + +static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX) +{ /* fills GRAIN_HEIGHT x GRAIN_WIDTH entries of buf: scaled Gaussian samples first, then an in-place auto-regressive filter over the interior */ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed = data->seed; + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) /* stop at the current sample: only already-filtered neighbours contribute */ + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +static NOINLINE void +generate_grain_uv_c(entry buf[][GRAIN_WIDTH], + const entry buf_y[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data, const intptr_t uv, + const int subx, const int suby HIGHBD_DECL_SUFFIX) +{ /* chroma counterpart of generate_grain_y_c: subx/suby select the SUB_GRAIN_WIDTH/SUB_GRAIN_HEIGHT extent, uv selects the seed constant and the coefficient set */ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed =
data->seed ^ (uv ? 0x49d8 : 0xb524); + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_uv[uv]; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_y_points) + break; + int luma = 0; /* becomes the rounded average of the (1 << subx) x (1 << suby) luma grain samples co-located with this chroma sample */ + const int lumaX = ((x - ar_pad) << subx) + ar_pad; + const int lumaY = ((y - ar_pad) << suby) + ar_pad; + for (int i = 0; i <= suby; i++) { + for (int j = 0; j <= subx; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, subx + suby); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +#define gnuv_ss_fn(nm, ss_x, ss_y) \ +static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ + generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ +} + +gnuv_ss_fn(420, 1, 1); /* one wrapper per chroma layout, fixing (ss_x, ss_y) */ +gnuv_ss_fn(422, 1, 0); +gnuv_ss_fn(444, 0, 0); + +// samples from the correct block of a grain LUT, while taking into account the +// offsets provided by the offsets cache +static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], + const int offsets[2][2], const
int subx, const int suby, + const int bx, const int by, const int x, const int y) +{ + const int randval = offsets[bx][by]; /* high nibble drives the x offset, low nibble the y offset */ + const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); + const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); + return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] + [offx + x + (BLOCK_SIZE >> subx) * bx]; +} + +static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ /* adds luma grain to one row of blocks (bh lines, pw pixels) read from src_row and written to dst_row; with overlap_flag the first two columns/lines of a block are blended with the neighbouring block's grain */ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = 235 << bitdepth_min_8; + } else { + min_value = 0; +#if BITDEPTH == 8 + max_value = 0xff; +#else + max_value = bitdepth_max; +#endif + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + const int bw = imin(BLOCK_SIZE, (int) pw - bx); + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + // x/y block offsets to compensate for overlapped regions + const int ystart =
data->overlap_flag && row_num ? imin(2, bh) : 0; + const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0; + + static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; /* overlap blend weights { old, new }, indexed by distance from the block edge; used with a rounding shift by 5 */ + +#define add_noise_y(x, y, grain) \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + add_noise_y(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain * w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + grain = round2(old * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel with the top left block + int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y); + top = round2(old * w[x][0] + top * w[x][1], 5); + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain *
w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + + // Mix the two rows together and apply grain + grain = round2(top * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + } +} + +static NOINLINE void +fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, + const int pw, const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], const int bh, + const int row_num, const pixel *const luma_row, + const ptrdiff_t luma_stride, const int uv, const int is_id, + const int sx, const int sy HIGHBD_DECL_SUFFIX) +{ /* chroma counterpart of fgy_32x32xn_c: sx/sy are the horizontal/vertical subsampling shifts, and the scaling index is derived from the co-located luma in luma_row */ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = (is_id ?
235 : 240) << bitdepth_min_8; + } else { + min_value = 0; +#if BITDEPTH == 8 + max_value = 0xff; +#else + max_value = bitdepth_max; +#endif + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks (subsampled) + for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { + const int bw = imin(BLOCK_SIZE >> sx, pw - bx); + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + // x/y block offsets to compensate for overlapped regions + const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0; + const int xstart = data->overlap_flag && bx ?
imin(2 >> sx, bw) : 0; + + static const int w[2 /* sub */][2 /* off */][2] = { + { { 27, 17 }, { 17, 27 } }, + { { 23, 22 } }, /* subsampled axis: the overlap shrinks to one sample (2 >> 1), so only offset 0 is used */ + }; + +#define add_noise_uv(x, y, grain) \ + const int lx = (bx + (x)) << sx; \ + const int ly = (y) << sy; \ + const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \ + pixel avg = luma[0]; \ + if (sx) \ + avg = (avg + luma[1] + 1) >> 1; \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + int val = avg; \ + if (!data->chroma_scaling_from_luma) { \ + const int combined = avg * data->uv_luma_mult[uv] + \ + *src * data->uv_mult[uv]; \ + val = iclip_pixel( (combined >> 6) + \ + (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \ + } \ + const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + add_noise_uv(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5; + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5; + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel
with the top left block + int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); + top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5; + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5; + grain = iclip(grain, grain_min, grain_max); + + // Mix the two rows together and apply to image + grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5; + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + } +} + +#define fguv_ss_fn(nm, ss_x, ss_y) \ +static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \ + fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \ + row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \ + HIGHBD_TAIL_SUFFIX); \ +} + +fguv_ss_fn(420, 1, 1); /* one wrapper per chroma layout, fixing (ss_x, ss_y) */ +fguv_ss_fn(422, 1, 0); +fguv_ss_fn(444, 0, 0); + +COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { /* installs the C implementations from this file, then calls the x86 init when built with HAVE_ASM && ARCH_X86 */ + c->generate_grain_y = generate_grain_y_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c; + + c->fgy_32x32xn = fgy_32x32xn_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; + +#if HAVE_ASM && ARCH_X86 + bitfn(dav1d_film_grain_dsp_init_x86)(c); +#endif +} diff --git a/third_party/dav1d/src/getbits.c b/third_party/dav1d/src/getbits.c new file mode 100644 index 0000000000..7bb20140e4 --- /dev/null +++ b/third_party/dav1d/src/getbits.c @@ -0,0 +1,162 @@ +/* + * Copyright © 2018,
VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <limits.h> + +#include "common/intops.h" + +#include "src/getbits.h" + +void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data, + const size_t sz) +{ + // If sz were 0, c->eof would need to be initialized to 1.
+ assert(sz); + c->ptr = c->ptr_start = data; + c->ptr_end = &c->ptr_start[sz]; + c->bits_left = 0; + c->state = 0; + c->error = 0; + c->eof = 0; +} + +static void refill(GetBits *const c, const unsigned n) { /* tops up c->state one byte at a time until at least n bits are buffered; once the input is exhausted zero bits are appended, and a refill that starts with eof already set raises c->error */ + assert(c->bits_left <= 56); + uint64_t state = 0; + do { + state <<= 8; + c->bits_left += 8; + if (!c->eof) + state |= *c->ptr++; + if (c->ptr >= c->ptr_end) { + c->error = c->eof; + c->eof = 1; + } + } while (n > c->bits_left); + c->state |= state << (64 - c->bits_left); +} + +unsigned dav1d_get_bits(GetBits *const c, const unsigned n) { /* returns the next n bits (1..32), taken from the top of c->state */ + assert(n <= 32 /* can go up to 57 if we change return type */); + assert(n /* can't shift state by 64 */); + + if (n > c->bits_left) refill(c, n); + + const uint64_t state = c->state; + c->bits_left -= n; + c->state <<= n; + + return (unsigned) (state >> (64 - n)); +} + +int dav1d_get_sbits(GetBits *const c, const unsigned n) { /* reads n + 1 bits and sign-extends the result from its top bit */ + const int shift = 31 - n; + const int res = dav1d_get_bits(c, n + 1) << shift; + return res >> shift; +} + +unsigned dav1d_get_uleb128(GetBits *const c) { /* decodes a little-endian base-128 value of at most 8 bytes; sets c->error and returns 0 when a continuation bit is still set after the 8th byte or the value exceeds UINT_MAX */ + uint64_t val = 0; + unsigned i = 0, more; + + do { + const int v = dav1d_get_bits(c, 8); + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << i; + i += 7; + } while (more && i < 56); + + if (val > UINT_MAX || more) { + c->error = 1; + return 0; + } + + return (unsigned) val; +} + +unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { + // Output in range [0..max-1] + // max must be > 1, or else nothing is read from the bitstream + assert(max > 1); + const int l = ulog2(max) + 1; + assert(l > 1); + const unsigned m = (1U << l) - max; + const unsigned v = dav1d_get_bits(c, l - 1); + return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1); +} + +unsigned dav1d_get_vlc(GetBits *const c) { /* counts leading zero bits (returning 0xFFFFFFFF once 32 are seen), then reads that many further bits as an offset from (1 << n_bits) - 1 */ + int n_bits = 0; + while (!dav1d_get_bits(c, 1)) + if (++n_bits == 32) + return 0xFFFFFFFFU; + return n_bits ?
((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0; +} + +static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, + const unsigned n) +{ + unsigned v = 0; + + for (int i = 0;; i++) { + const int b = i ? 3 + i - 1 : 3; + + if (n < v + 3 * (1 << b)) { + v += dav1d_get_uniform(c, n - v + 1); + break; + } + + if (!dav1d_get_bits(c, 1)) { + v += dav1d_get_bits(c, b); + break; + } + + v += 1 << b; + } + + return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v); +} + +int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) { + return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n); +} + +void dav1d_bytealign_get_bits(GetBits *c) { + // bits_left is never more than 7, because it is only incremented + // by refill(), called by dav1d_get_bits and that never reads more + // than 7 bits more than it needs. + // + // If this wasn't true, we would need to work out how many bits to + // discard (bits_left % 8), subtract that from bits_left and then + // shift state right by that amount. + assert(c->bits_left <= 7); + + c->bits_left = 0; + c->state = 0; +} diff --git a/third_party/dav1d/src/getbits.h b/third_party/dav1d/src/getbits.h new file mode 100644 index 0000000000..fc382148b2 --- /dev/null +++ b/third_party/dav1d/src/getbits.h @@ -0,0 +1,59 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_GETBITS_H +#define DAV1D_SRC_GETBITS_H + +#include +#include + +typedef struct GetBits { + int error, eof; + uint64_t state; + unsigned bits_left; + const uint8_t *ptr, *ptr_start, *ptr_end; +} GetBits; + +void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz); +unsigned dav1d_get_bits(GetBits *c, unsigned n); +int dav1d_get_sbits(GetBits *c, unsigned n); +unsigned dav1d_get_uleb128(GetBits *c); + +// Output in range 0..max-1 +unsigned dav1d_get_uniform(GetBits *c, unsigned max); +unsigned dav1d_get_vlc(GetBits *c); +int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n); + +// Discard bits from the buffer until we're next byte-aligned. +void dav1d_bytealign_get_bits(GetBits *c); + +// Return the current bit position relative to the start of the buffer. 
+static inline unsigned dav1d_get_bits_pos(const GetBits *c) { + return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left; +} + +#endif /* DAV1D_SRC_GETBITS_H */ diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h new file mode 100644 index 0000000000..51962a2cd7 --- /dev/null +++ b/third_party/dav1d/src/internal.h @@ -0,0 +1,357 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_INTERNAL_H +#define DAV1D_SRC_INTERNAL_H + +#include <stdatomic.h> + +#include "dav1d/data.h" + +typedef struct Dav1dFrameContext Dav1dFrameContext; +typedef struct Dav1dTileState Dav1dTileState; +typedef struct Dav1dTileContext Dav1dTileContext; + +#include "common/attributes.h" + +#include "src/cdef.h" +#include "src/cdf.h" +#include "src/data.h" +#include "src/env.h" +#include "src/film_grain.h" +#include "src/intra_edge.h" +#include "src/ipred.h" +#include "src/itx.h" +#include "src/levels.h" +#include "src/lf_mask.h" +#include "src/loopfilter.h" +#include "src/looprestoration.h" +#include "src/mc.h" +#include "src/msac.h" +#include "src/picture.h" +#include "src/recon.h" +#include "src/refmvs.h" +#include "src/thread.h" + +typedef struct Dav1dDSPContext { /* bundles the per-module DSP contexts; Dav1dContext keeps one per supported bit depth */ + Dav1dFilmGrainDSPContext fg; + Dav1dIntraPredDSPContext ipred; + Dav1dMCDSPContext mc; + Dav1dInvTxfmDSPContext itx; + Dav1dLoopFilterDSPContext lf; + Dav1dCdefDSPContext cdef; + Dav1dLoopRestorationDSPContext lr; +} Dav1dDSPContext; + +struct Dav1dTileGroup { + Dav1dData data; + int start, end; +}; + +struct Dav1dContext { + Dav1dFrameContext *fc; + unsigned n_fc; + + // cache of OBUs that make up a single frame before we submit them + // to a frame worker to be decoded + struct Dav1dTileGroup *tile; + int n_tile_data_alloc; + int n_tile_data; + int n_tiles; + Dav1dMemPool seq_hdr_pool; + Dav1dRef *seq_hdr_ref; + Dav1dSequenceHeader *seq_hdr; + Dav1dMemPool frame_hdr_pool; + Dav1dRef *frame_hdr_ref; + Dav1dFrameHeader *frame_hdr; + + Dav1dRef *content_light_ref; + Dav1dContentLightLevel *content_light; + Dav1dRef *mastering_display_ref; + Dav1dMasteringDisplay *mastering_display; + Dav1dRef *itut_t35_ref; + Dav1dITUTT35 *itut_t35; + + // decoded output picture queue + Dav1dData in; + Dav1dPicture out; + struct { + Dav1dThreadPicture *out_delayed; + unsigned next; + // dummy is a pointer to prevent compiler errors about atomic_load() + // not taking const arguments; the const attribute is not
taken + // from pointers + atomic_int flush_mem, *flush; + } frame_thread; + + // reference/entropy state + Dav1dMemPool segmap_pool; + Dav1dMemPool refmvs_pool; + struct { + Dav1dThreadPicture p; + Dav1dRef *segmap; + Dav1dRef *refmvs; + unsigned refpoc[7]; + } refs[8]; + Dav1dMemPool cdf_pool; + CdfThreadContext cdf[8]; + + Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */]; + + // tree to keep track of which edges are available + struct { + EdgeNode *root[2 /* BL_128X128 vs. BL_64X64 */]; + EdgeBranch branch_sb128[1 + 4 + 16 + 64]; + EdgeBranch branch_sb64[1 + 4 + 16]; + EdgeTip tip_sb128[256]; + EdgeTip tip_sb64[64]; + } intra_edge; + + Dav1dPicAllocator allocator; + int apply_grain; + int operating_point; + unsigned operating_point_idc; + int all_layers; + unsigned frame_size_limit; + int drain; + + Dav1dLogger logger; + + Dav1dMemPool picture_pool; + int mem_pools_inited; +}; + +struct Dav1dFrameContext { + Dav1dRef *seq_hdr_ref; + Dav1dSequenceHeader *seq_hdr; + Dav1dRef *frame_hdr_ref; + Dav1dFrameHeader *frame_hdr; + Dav1dThreadPicture refp[7]; + Dav1dPicture cur; // during block coding / reconstruction + Dav1dThreadPicture sr_cur; // after super-resolution upscaling + Dav1dRef *mvs_ref; + refmvs_temporal_block *mvs, *ref_mvs[7]; + Dav1dRef *ref_mvs_ref[7]; + Dav1dRef *cur_segmap_ref, *prev_segmap_ref; + uint8_t *cur_segmap; + const uint8_t *prev_segmap; + unsigned refpoc[7], refrefpoc[7][7]; + uint8_t gmv_warp_allowed[7]; + CdfThreadContext in_cdf, out_cdf; + struct Dav1dTileGroup *tile; + int n_tile_data_alloc; + int n_tile_data; + + // for scalable references + struct ScalableMotionParams { + int scale; // if no scaling, this is 0 + int step; + } svc[7][2 /* x, y */]; + int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */]; + + const Dav1dContext *c; + Dav1dTileContext *tc; + int n_tc; + Dav1dTileState *ts; + int n_ts; + const Dav1dDSPContext *dsp; + struct { + recon_b_intra_fn recon_b_intra; + recon_b_inter_fn recon_b_inter; + filter_sbrow_fn 
filter_sbrow; + backup_ipred_edge_fn backup_ipred_edge; + read_coef_blocks_fn read_coef_blocks; + } bd_fn; + + int ipred_edge_sz; + pixel *ipred_edge[3]; + ptrdiff_t b4_stride; + int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w; + uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */]; + const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */]; + BlockContext *a; + int a_sz /* w*tile_rows */; + refmvs_frame rf; + uint8_t jnt_weights[7][7]; + int bitdepth_max; + + struct { + struct thread_data td; + int pass, die; + // indexed using t->by * f->b4_stride + t->bx + Av1Block *b; + struct CodedBlockInfo { + int16_t eob[3 /* plane */]; + uint8_t txtp[3 /* plane */]; + } *cbi; + // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1) + uint16_t (*pal)[3 /* plane */][8 /* idx */]; + // iterated over inside tile state + uint8_t *pal_idx; + coef *cf; + int pal_sz, pal_idx_sz, cf_sz; + // start offsets per tile + int *tile_start_off; + } frame_thread; + + // loopfilter + struct { + uint8_t (*level)[4]; + Av1Filter *mask; + Av1Restoration *lr_mask; + int top_pre_cdef_toggle; + int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */; + int lr_line_sz, re_sz /* h */; + ALIGN(Av1FilterLUT lim_lut, 16); + int last_sharpness; + uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; + uint8_t *tx_lpf_right_edge[2]; + uint8_t *cdef_line_buf; + pixel *cdef_line[2 /* pre, post */][3 /* plane */]; + pixel *lr_lpf_line[3 /* plane */]; + + // in-loop filter per-frame state keeping + int tile_row; // for carry-over at tile row edges + pixel *p[3], *sr_p[3]; + Av1Filter *mask_ptr, *prev_mask_ptr; + int restore_planes; // enum LrRestorePlanes + } lf; + + // threading (refer to tc[] for per-thread things) + struct FrameTileThreadData { + uint64_t available; + pthread_mutex_t lock; + pthread_cond_t cond, icond; + int tasks_left, num_tasks; + int (*task_idx_to_sby_and_tile_idx)[2]; + int titsati_sz, titsati_init[2]; + 
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS]; + int inited; + } tile_thread; +}; + +struct Dav1dTileState { + CdfContext cdf; + MsacContext msac; + + struct { + int col_start, col_end, row_start, row_end; // in 4px units + int col, row; // in tile units + } tiling; + + atomic_int progress; // in sby units, TILE_ERROR after a decoding error + struct { + pthread_mutex_t lock; + pthread_cond_t cond; + } tile_thread; + struct { + uint8_t *pal_idx; + coef *cf; + } frame_thread; + + uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */]; + const uint16_t (*dq)[3][2]; + int last_qidx; + + int8_t last_delta_lf[4]; + uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; + const uint8_t (*lflvl)[4][8][2]; + + Av1RestorationUnit *lr_ref[3]; +}; + +struct Dav1dTileContext { + const Dav1dFrameContext *f; + Dav1dTileState *ts; + int bx, by; + BlockContext l, *a; + ALIGN(union, 32) { + int16_t cf_8bpc [32 * 32]; + int32_t cf_16bpc[32 * 32]; + }; + // FIXME types can be changed to pixel (and dynamically allocated) + // which would make copy/assign operations slightly faster? 
+ uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; + uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */]; + uint8_t txtp_map[32 * 32]; // inter-only + refmvs_tile rt; + ALIGN(union, 64) { + struct { + union { + uint8_t lap_8bpc [128 * 32]; + uint16_t lap_16bpc[128 * 32]; + struct { + int16_t compinter[2][128 * 128]; + uint8_t seg_mask[128 * 128]; + }; + }; + union { + // stride=192 for non-SVC, or 320 for SVC + uint8_t emu_edge_8bpc [320 * (256 + 7)]; + uint16_t emu_edge_16bpc[320 * (256 + 7)]; + }; + }; + struct { + union { + uint8_t levels[32 * 34]; + struct { + uint8_t pal_order[64][8]; + uint8_t pal_ctx[64]; + }; + }; + int16_t ac[32 * 32]; + uint8_t pal_idx[2 * 64 * 64]; + uint16_t pal[3 /* plane */][8 /* palette_idx */]; + ALIGN(union, 32) { + struct { + uint8_t interintra_8bpc[64 * 64]; + uint8_t edge_8bpc[257]; + }; + struct { + uint16_t interintra_16bpc[64 * 64]; + uint16_t edge_16bpc[257]; + }; + }; + }; + } scratch; + + Dav1dWarpedMotionParams warpmv; + Av1Filter *lf_mask; + int8_t *cur_sb_cdef_idx_ptr; + // for chroma sub8x8, we need to know the filter for all 4 subblocks in + // a 4x4 area, but the top/left one can go out of cache already, so this + // keeps it accessible + enum Filter2d tl_4x4_filter; + + struct { + struct thread_data td; + struct FrameTileThreadData *fttd; + int die; + } tile_thread; +}; + +#endif /* DAV1D_SRC_INTERNAL_H */ diff --git a/third_party/dav1d/src/intra_edge.c b/third_party/dav1d/src/intra_edge.c new file mode 100644 index 0000000000..684d113fa9 --- /dev/null +++ b/third_party/dav1d/src/intra_edge.c @@ -0,0 +1,165 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+
+/* NOTE(review): the header name was lost in extraction; <stdlib.h> is
+ * restored because this file uses NULL. assert() is presumably provided via
+ * common/attributes.h - confirm against upstream. */
+#include <stdlib.h>
+
+#include "common/attributes.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+/*
+ * Allocation cursors used while the tree is built: the next unused
+ * EdgeBranch for each block level and the next unused EdgeTip. Both are
+ * advanced with post-increment as nodes are handed out.
+ */
+struct ModeSelMem {
+    EdgeBranch *nwc[3 /* 64x64, 32x32, 16x16 */];
+    EdgeTip *nt;
+};
+
+/*
+ * Fill in the edge flags of a single node.
+ *
+ * node->o receives edge_flags unchanged; every other field is derived from
+ * edge_flags by OR-ing in or masking with the TOP_HAS_RIGHT and
+ * LEFT_HAS_BOTTOM flag groups. When bl == BL_8X8 the node is accessed as an
+ * EdgeTip, otherwise as an EdgeBranch; both structs start with an EdgeNode,
+ * which is what makes the casts below valid.
+ *
+ * ALL_FL() is deliberately left defined after this function: init_mode_node()
+ * uses it as well.
+ */
+static void init_edges(EdgeNode *const node,
+                       const enum BlockLevel bl,
+                       const enum EdgeFlags edge_flags)
+{
+    node->o = edge_flags;
+
+#define ALL_FL(t) (EDGE_I444_##t | EDGE_I422_##t | EDGE_I420_##t)
+    if (bl == BL_8X8) {
+        EdgeTip *const nt = (EdgeTip *) node;
+
+        node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        node->h[1] = edge_flags & (ALL_FL(LEFT_HAS_BOTTOM) |
+                                   EDGE_I420_TOP_HAS_RIGHT);
+
+        node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        node->v[1] = edge_flags & (ALL_FL(TOP_HAS_RIGHT) |
+                                   EDGE_I420_LEFT_HAS_BOTTOM |
+                                   EDGE_I422_LEFT_HAS_BOTTOM);
+
+        nt->split[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nt->split[1] = (edge_flags & ALL_FL(TOP_HAS_RIGHT)) |
+                       EDGE_I422_LEFT_HAS_BOTTOM;
+        nt->split[2] = edge_flags | EDGE_I444_TOP_HAS_RIGHT;
+        nt->split[3] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT |
+                                     EDGE_I420_LEFT_HAS_BOTTOM |
+                                     EDGE_I422_LEFT_HAS_BOTTOM);
+    } else {
+        EdgeBranch *const nwc = (EdgeBranch *) node;
+
+        node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        node->h[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+        node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        node->v[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+        nwc->h4[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->h4[1] =
+        nwc->h4[2] = ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->h4[3] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+        if (bl == BL_16X16)
+            nwc->h4[1] |= edge_flags & EDGE_I420_TOP_HAS_RIGHT;
+
+        nwc->v4[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->v4[1] =
+        nwc->v4[2] = ALL_FL(TOP_HAS_RIGHT);
+        nwc->v4[3] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+        if (bl == BL_16X16)
+            nwc->v4[1] |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM |
+                                        EDGE_I422_LEFT_HAS_BOTTOM);
+
+        nwc->tls[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tls[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tls[2] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+        nwc->trs[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->trs[1] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->trs[2] = 0;
+
+        nwc->tts[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tts[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+        nwc->tts[2] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+        nwc->tbs[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tbs[1] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->tbs[2] = 0;
+    }
+}
+
+/*
+ * Initialise one branch node and, recursively, its four children.
+ *
+ * Child n gets the TOP_HAS_RIGHT group unless n == 3, or n == 1 and the
+ * parent lacks it; it gets the LEFT_HAS_BOTTOM group only when n == 0, or
+ * n == 2 and the parent has it. At BL_16X16 the children are EdgeTips taken
+ * from mem->nt and initialised at level bl + 1; at every other level they
+ * are EdgeBranches taken from mem->nwc[bl].
+ */
+static void init_mode_node(EdgeBranch *const nwc,
+                           const enum BlockLevel bl,
+                           struct ModeSelMem *const mem,
+                           const int top_has_right,
+                           const int left_has_bottom)
+{
+    init_edges(&nwc->node, bl,
+               (top_has_right ? ALL_FL(TOP_HAS_RIGHT) : 0) |
+               (left_has_bottom ? ALL_FL(LEFT_HAS_BOTTOM) : 0));
+    if (bl == BL_16X16) {
+        for (int n = 0; n < 4; n++) {
+            EdgeTip *const nt = mem->nt++;
+            nwc->split[n] = &nt->node;
+            init_edges(&nt->node, bl + 1,
+                       ((n == 3 || (n == 1 && !top_has_right)) ? 0 :
+                        ALL_FL(TOP_HAS_RIGHT)) |
+                       (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 :
+                        ALL_FL(LEFT_HAS_BOTTOM)));
+        }
+    } else {
+        for (int n = 0; n < 4; n++) {
+            EdgeBranch *const nwc_child = mem->nwc[bl]++;
+            nwc->split[n] = &nwc_child->node;
+            init_mode_node(nwc_child, bl + 1, mem,
+                           !(n == 3 || (n == 1 && !top_has_right)),
+                           n == 0 || (n == 2 && left_has_bottom));
+        }
+    }
+}
+
+/*
+ * Build the complete edge-availability tree in caller-provided storage.
+ *
+ * root_node must be the first element of an EdgeBranch array and nt the
+ * first element of an EdgeTip array. With allow_sb128 the tree is rooted at
+ * BL_128X128 and consumes 1 + 4 + 16 + 64 branches and 256 tips; otherwise
+ * it is rooted at BL_64X64 and consumes 1 + 4 + 16 branches and 64 tips.
+ * The asserts check that exactly that many nodes were handed out.
+ */
+void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt,
+                          const int allow_sb128)
+{
+    EdgeBranch *const root = (EdgeBranch *) root_node;
+    struct ModeSelMem mem;
+    mem.nt = nt;
+
+    if (allow_sb128) {
+        mem.nwc[BL_128X128] = &root[1];
+        mem.nwc[BL_64X64] = &root[1 + 4];
+        mem.nwc[BL_32X32] = &root[1 + 4 + 16];
+        init_mode_node(root, BL_128X128, &mem, 1, 0);
+        assert(mem.nwc[BL_128X128] == &root[1 + 4]);
+        assert(mem.nwc[BL_64X64] == &root[1 + 4 + 16]);
+        assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16 + 64]);
+        assert(mem.nt == &nt[256]);
+    } else {
+        mem.nwc[BL_128X128] = NULL;
+        mem.nwc[BL_64X64] = &root[1];
+        mem.nwc[BL_32X32] = &root[1 + 4];
+        init_mode_node(root, BL_64X64, &mem, 1, 0);
+        assert(mem.nwc[BL_64X64] == &root[1 + 4]);
+        assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16]);
+        assert(mem.nt == &nt[64]);
+    }
+}
diff --git a/third_party/dav1d/src/intra_edge.h b/third_party/dav1d/src/intra_edge.h
new file mode 100644
index 0000000000..8b4e150181
--- /dev/null
+++ b/third_party/dav1d/src/intra_edge.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTRA_EDGE_H
+#define DAV1D_SRC_INTRA_EDGE_H
+
+/*
+ * Bit flags, one bit per (chroma layout, edge) pair: a TOP_HAS_RIGHT and a
+ * LEFT_HAS_BOTTOM bit for each of I444, I422 and I420. The values are
+ * distinct powers of two, so they can be OR-ed and masked freely.
+ */
+enum EdgeFlags {
+    EDGE_I444_TOP_HAS_RIGHT = 1 << 0,
+    EDGE_I422_TOP_HAS_RIGHT = 1 << 1,
+    EDGE_I420_TOP_HAS_RIGHT = 1 << 2,
+    EDGE_I444_LEFT_HAS_BOTTOM = 1 << 3,
+    EDGE_I422_LEFT_HAS_BOTTOM = 1 << 4,
+    EDGE_I420_LEFT_HAS_BOTTOM = 1 << 5,
+};
+
+/*
+ * Flags common to every tree node.
+ * NOTE(review): o/h/v presumably hold the flags for the unsplit block and
+ * for the two halves of a horizontal / vertical split - confirm against the
+ * partition-decoding code that reads them.
+ */
+typedef struct EdgeNode EdgeNode;
+struct EdgeNode {
+    enum EdgeFlags o, h[2], v[2];
+};
+/* Leaf node: the common flags plus one flag set per 4-way split part. */
+typedef struct EdgeTip {
+    EdgeNode node;
+    enum EdgeFlags split[4];
+} EdgeTip;
+/*
+ * Interior node: the common flags, flag sets for the further partition
+ * shapes (tts/tbs/tls/trs with three parts each, h4/v4 with four parts
+ * each), and pointers to the four child nodes.
+ */
+typedef struct EdgeBranch {
+    EdgeNode node;
+    enum EdgeFlags tts[3], tbs[3], tls[3], trs[3], h4[4], v4[4];
+    EdgeNode *split[4];
+} EdgeBranch;
+
+/*
+ * Initialise an edge tree in caller-provided storage: root is the first
+ * element of an EdgeBranch array, nt the first element of an EdgeTip array,
+ * and allow_sb128 selects a 128x128 root instead of a 64x64 one.
+ */
+void dav1d_init_mode_tree(EdgeNode *const root, EdgeTip *const nt,
+                          const int allow_sb128);
+
+#endif /* DAV1D_SRC_INTRA_EDGE_H */
diff --git a/third_party/dav1d/src/ipred.h b/third_party/dav1d/src/ipred.h
new file mode 100644
index 0000000000..5df2657740
--- /dev/null
+++ b/third_party/dav1d/src/ipred.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_H
+#define DAV1D_SRC_IPRED_H
+
+#include <stddef.h> /* ptrdiff_t */
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+/*
+ * Intra prediction.
+ * - angle is the angle (in degrees) for directional intra predictors. For
+ *   other modes, it is ignored;
+ * - topleft is the same as the argument given to dav1d_prepare_intra_edges(),
+ *   see ipred_prepare.h for more detailed documentation.
+ */
+#define decl_angular_ipred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+            int width, int height, int angle, int max_width, int max_height \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_angular_ipred_fn(*angular_ipred_fn);
+
+/*
+ * Create a subsampled Y plane with the DC subtracted.
+ * - w/h_pad is the edge of the width/height that extends outside the visible
+ *   portion of the frame in 4px units;
+ * - ac has a stride of 16.
+ */
+#define decl_cfl_ac_fn(name) \
+void (name)(int16_t *ac, const pixel *y, ptrdiff_t stride, \
+            int w_pad, int h_pad, int cw, int ch)
+typedef decl_cfl_ac_fn(*cfl_ac_fn);
+
+/*
+ * dst[x,y] += alpha * ac[x,y]
+ * - alpha contains a q3 scalar in [-16,16] range;
+ */
+#define decl_cfl_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+            int width, int height, const int16_t *ac, int alpha \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_cfl_pred_fn(*cfl_pred_fn);
+
+/*
+ * dst[x,y] = pal[idx[x,y]]
+ * - palette indices are [0-7]
+ */
+#define decl_pal_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
+            const uint8_t *idx, int w, int h)
+typedef decl_pal_pred_fn(*pal_pred_fn);
+
+/*
+ * Function-pointer table for intra prediction: one angular predictor per
+ * implemented mode, the chroma-from-luma helpers, and the palette predictor.
+ * The dav1d_intra_pred_dsp_init*() functions declared below take a pointer
+ * to this table.
+ */
+typedef struct Dav1dIntraPredDSPContext {
+    angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
+
+    // chroma-from-luma
+    cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */];
+    cfl_pred_fn cfl_pred[DC_128_PRED + 1];
+
+    // palette
+    pal_pred_fn pal_pred;
+} Dav1dIntraPredDSPContext;
+
+bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
+
+#endif /* DAV1D_SRC_IPRED_H */
diff --git a/third_party/dav1d/src/ipred_prepare.h b/third_party/dav1d/src/ipred_prepare.h
new file mode 100644
index 0000000000..6a7efeb3d7
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN
and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_PREPARE_H
+#define DAV1D_SRC_IPRED_PREPARE_H
+
+/* NOTE(review): both header names were lost in extraction. <stddef.h>
+ * supplies ptrdiff_t for the prototype below; <stdint.h> is presumed from
+ * the surrounding headers - confirm against upstream. */
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/env.h"
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+/*
+ * Luma intra edge preparation.
+ *
+ * x/y/start/w/h are in luma block (4px) units:
+ * - x and y are the absolute block positions in the image;
+ * - start/w/h are the *dependent tile* boundary positions. In practice, start
+ *   is the horizontal tile start, w is the horizontal tile end, the vertical
+ *   tile start is assumed to be 0 and h is the vertical image end.
+ *
+ * NOTE: the prototype below has no `start` parameter. Availability of the
+ * left and top neighbours is passed explicitly through have_left and
+ * have_top instead.
+ *
+ * edge_flags signals which edges are available for this transform-block inside
+ * the given partition, as well as for the partition inside the superblock
+ * structure.
+ *
+ * dst and stride are pointers to the top/left position of the current block,
+ * and can be used to locate the top, left, top/left, top/right and bottom/left
+ * edge pointers also.
+ *
+ * angle is the angle_delta [-3..3] on input, and the absolute angle on output.
+ *
+ * mode is the intra prediction mode as coded in the bitstream. The return value
+ * is this same mode, converted to an index in the DSP functions.
+ *
+ * tw/th are the size of the transform block in block (4px) units.
+ *
+ * topleft_out is a pointer to scratch memory that will be filled with the edge
+ * pixels. The memory array should have space to be indexed in the [-2*w,2*w]
+ * range, in the following order:
+ * - [0] will be the top/left edge pixel;
+ * - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
+ * - [w+1..2*w] will be the top/right edge pixels;
+ * - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
+ *   most);
+ * - [-w-1..-2*w] will be the bottom/left edge pixels.
+ * Each edge may remain uninitialized if it is not used by the returned mode
+ * index. If edges are not available (because the edge position is outside the
+ * tile dimensions or because edge_flags indicates lack of edge availability),
+ * they will be extended from nearby edges as defined by the av1 spec.
+ */
+enum IntraPredMode
+    bytefn(dav1d_prepare_intra_edges)(int x, int have_left, int y, int have_top,
+                                      int w, int h, enum EdgeFlags edge_flags,
+                                      const pixel *dst, ptrdiff_t stride,
+                                      const pixel *prefilter_toplevel_sb_edge,
+                                      enum IntraPredMode mode, int *angle,
+                                      int tw, int th, int filter_edge,
+                                      pixel *topleft_out HIGHBD_DECL_SUFFIX);
+
+// These flags are OR'd with the angle argument into intra predictors.
+// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
+// with a filter before using them to predict values in a block.
+// ANGLE_SMOOTH_EDGE_FLAG means that edges are smooth and should use
+// reduced filter strength.
+#define ANGLE_USE_EDGE_FILTER_FLAG 1024
+#define ANGLE_SMOOTH_EDGE_FLAG 512
+
+/*
+ * Returns ANGLE_SMOOTH_EDGE_FLAG when entry idx of the block context is
+ * intra-coded with one of the three SMOOTH luma modes, and 0 otherwise
+ * (including for every non-intra entry).
+ */
+static inline int sm_flag(const BlockContext *const b, const int idx) {
+    if (!b->intra[idx]) return 0;
+    const enum IntraPredMode m = b->mode[idx];
+    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+/*
+ * Chroma counterpart of sm_flag(): tests b->uvmode[idx] against the three
+ * SMOOTH modes. Unlike sm_flag() it does not look at b->intra[idx].
+ */
+static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
+    const enum IntraPredMode m = b->uvmode[idx];
+    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+#endif /* DAV1D_SRC_IPRED_PREPARE_H */
diff --git a/third_party/dav1d/src/ipred_prepare_tmpl.c b/third_party/dav1d/src/ipred_prepare_tmpl.c
new file mode 100644
index 0000000000..0bf9de9418
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare_tmpl.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+/* NOTE(review): both header names were lost in extraction. <stdint.h>
+ * supplies uint8_t for the tables below; <string.h> is presumed to back the
+ * pixel_set()/pixel_copy() macros - confirm against upstream. */
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+/*
+ * Replacement mode for DC_PRED and PAETH_PRED, indexed by
+ * [mode][have_left][have_top]. Only these two rows are initialised; every
+ * other row is zero and is not consulted by the lookup in
+ * dav1d_prepare_intra_edges(), which is reached for these two modes only.
+ */
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+                                  [2 /* have_left */][2 /* have_top */] =
+{
+    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
+                     { LEFT_DC_PRED, DC_PRED     } },
+    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
+                     { HOR_PRED,     PAETH_PRED  } },
+};
+
+/*
+ * Base angle in degrees of each directional mode, indexed by
+ * (mode - VERT_PRED).
+ */
+static const uint8_t av1_mode_to_angle_map[8] = {
+    90, 180, 45, 135, 113, 157, 203, 67
+};
+
+/*
+ * Which neighbouring edges each implemented prediction mode reads. Fields
+ * that are not named in an initialiser are zero.
+ */
+static const struct {
+    uint8_t needs_left:1;
+    uint8_t needs_top:1;
+    uint8_t needs_topleft:1;
+    uint8_t needs_topright:1;
+    uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
+    [VERT_PRED]     = { .needs_top  = 1 },
+    [HOR_PRED]      = { .needs_left = 1 },
+    [LEFT_DC_PRED]  = { .needs_left = 1 },
+    [TOP_DC_PRED]   = { .needs_top  = 1 },
+    [DC_128_PRED]   = { 0 },
+    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
+                        .needs_topleft = 1 },
+    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
+                        .needs_topleft = 1 },
+    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+/*
+ * Resolve the coded intra mode to a DSP mode index and fill topleft_out
+ * with the edge pixels that mode reads.
+ *
+ * - Directional modes: *angle is rewritten as base angle + 3 * (*angle) and
+ *   the mode becomes Z1/Z2/Z3, or plain VERT/HOR when the angle is exactly
+ *   90/180 or the required neighbour is missing.
+ * - DC_PRED and PAETH_PRED are remapped through av1_mode_conv according to
+ *   have_left / have_top.
+ * - Every other mode is kept as is.
+ *
+ * Edges are then written around topleft_out: left pixels at negative
+ * indices (topleft_out[-1] is the top-most), top pixels from index 1, and
+ * the corner at index 0. Missing or out-of-range pixels are replicated from
+ * the nearest available one or replaced by a mid-grey constant.
+ *
+ * Returns the resolved mode.
+ */
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+                                  const int y, const int have_top,
+                                  const int w, const int h,
+                                  const enum EdgeFlags edge_flags,
+                                  const pixel *const dst,
+                                  const ptrdiff_t stride,
+                                  const pixel *prefilter_toplevel_sb_edge,
+                                  enum IntraPredMode mode, int *const angle,
+                                  const int tw, const int th, const int filter_edge,
+                                  pixel *const topleft_out HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth = bitdepth_from_max(bitdepth_max);
+    assert(y < h && x < w);
+
+    switch (mode) {
+    case VERT_PRED:
+    case HOR_PRED:
+    case DIAG_DOWN_LEFT_PRED:
+    case DIAG_DOWN_RIGHT_PRED:
+    case VERT_RIGHT_PRED:
+    case HOR_DOWN_PRED:
+    case HOR_UP_PRED:
+    case VERT_LEFT_PRED: {
+        // on input *angle is a delta in steps of 3 degrees
+        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
+
+        if (*angle <= 90)
+            mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
+        else if (*angle < 180)
+            mode = Z2_PRED;
+        else
+            mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
+        break;
+    }
+    case DC_PRED:
+    case PAETH_PRED:
+        mode = av1_mode_conv[mode][have_left][have_top];
+        break;
+    default:
+        break;
+    }
+
+    // dst_top is only assigned when a later step actually reads the row
+    // above; each later use is guarded by have_top together with one of the
+    // conditions tested here
+    const pixel *dst_top;
+    if (have_top &&
+        (av1_intra_prediction_edges[mode].needs_top ||
+         av1_intra_prediction_edges[mode].needs_topleft ||
+         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+    {
+        if (prefilter_toplevel_sb_edge) {
+            dst_top = &prefilter_toplevel_sb_edge[x * 4];
+        } else {
+            dst_top = &dst[-PXSTRIDE(stride)];
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_left) {
+        const int sz = th << 2;
+        pixel *const left = &topleft_out[-sz];
+
+        if (have_left) {
+            const int px_have = imin(sz, (h - y) << 2);
+
+            // stored bottom-up: left[sz - 1] is the top-most left pixel
+            for (int i = 0; i < px_have; i++)
+                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+            // rows beyond h: replicate the last available pixel
+            if (px_have < sz)
+                pixel_set(left, left[sz - px_have], sz - px_have);
+        } else {
+            pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+            if (have_bottomleft) {
+                const int px_have = imin(sz, (h - y - th) << 2);
+
+                for (int i = 0; i < px_have; i++)
+                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+                if (px_have < sz)
+                    pixel_set(left - sz, left[-px_have], sz - px_have);
+            } else {
+                // no bottom/left: extend the bottom-most left pixel
+                pixel_set(left - sz, left[0], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_top) {
+        const int sz = tw << 2;
+        pixel *const top = &topleft_out[1];
+
+        if (have_top) {
+            const int px_have = imin(sz, (w - x) << 2);
+            pixel_copy(top, dst_top, px_have);
+            // columns beyond w: replicate the last available pixel
+            if (px_have < sz)
+                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+        } else {
+            pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_topright) {
+            const int have_topright = (!have_top || x + tw >= w) ? 0 :
+                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+            if (have_topright) {
+                const int px_have = imin(sz, (w - x - tw) << 2);
+
+                pixel_copy(top + sz, &dst_top[sz], px_have);
+                if (px_have < sz)
+                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
+                              sz - px_have);
+            } else {
+                // no top/right: extend the right-most top pixel
+                pixel_set(top + sz, top[sz - 1], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_topleft) {
+        if (have_left)
+            *topleft_out = have_top ? dst_top[-1] : dst[-1];
+        else
+            *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
+
+        // 5/6/5 smoothing of the corner from its two neighbours, with
+        // rounding, for Z2 blocks with tw + th >= 6 when filter_edge is set
+        if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
+            *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
+                            topleft_out[0] * 6 + 8) >> 4;
+    }
+
+    return mode;
+}
diff --git a/third_party/dav1d/src/ipred_tmpl.c b/third_party/dav1d/src/ipred_tmpl.c
new file mode 100644
index 0000000000..50c7a3c7be
--- /dev/null
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+/* NOTE(review): both header names were lost in extraction. <stdlib.h>
+ * supplies abs() used in cfl_pred(); <string.h> is presumed to back the
+ * pixel_copy() macro - confirm against upstream. */
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+/*
+ * Fill a width x height block with the constant dc.
+ *
+ * The value is replicated into a 64-bit word (32-bit for 8-bit blocks no
+ * wider than 4) and stored one word at a time.
+ * NOTE(review): the word stores assume dst rows are suitably aligned and
+ * that width is a multiple of the pixels per word - confirm with callers.
+ */
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 8
+    assert(dc <= 0xff);
+    if (width > 4) {
+        const uint64_t dcN = dc * 0x0101010101010101ULL;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((uint64_t *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    } else {
+        const unsigned dcN = dc * 0x01010101U;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((unsigned *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    }
+#else
+    assert(dc <= bitdepth_max);
+    const uint64_t dcN = dc * 0x0001000100010001ULL;
+    for (int y = 0; y < height; y++) {
+        // sizeof(dcN) >> 1: four 16-bit pixels per 64-bit store
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+            *((uint64_t *) &dst[x]) = dcN;
+        dst += PXSTRIDE(stride);
+    }
+#endif
+}
+
+/*
+ * Chroma-from-luma prediction: dst[x] = clip(dc + round(alpha * ac[x] / 64)),
+ * with the rounding applied to the magnitude and the sign restored after.
+ * ac is read with a row stride of width.
+ */
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const int dc,
+         const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
+{
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int diff = alpha * ac[x];
+            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+        }
+        ac += width;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+/*
+ * Rounded average of the width top-edge pixels topleft[1..width]. The
+ * division is a shift by ctz(width), which equals log2(width) only for a
+ * power-of-two width.
+ */
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
+    unsigned dc = width >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[1 + i];
+    return dc >> ctz(width);
+}
+
+/* DC prediction from the top edge only. */
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+/* Chroma-from-luma prediction with the DC taken from the top edge only. */
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha
+                            HIGHBD_DECL_SUFFIX)
+{
+    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha
+             HIGHBD_TAIL_SUFFIX);
+}
+
+/*
+ * Rounded average of the height left-edge pixels topleft[-1..-height]. The
+ * division is a shift by ctz(height), which equals log2(height) only for a
+ * power-of-two height.
+ */
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
+    unsigned dc = height >> 1;
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(1 + i)];
+    return dc >> ctz(height);
+}
+
+/* DC prediction from the left edge only. */
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height, const int a,
+                            const int max_width, const int max_height
+                            HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+/* Chroma-from-luma prediction with the DC taken from the left edge only. */
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height,
+                             const int16_t *ac, const int alpha
+                             HIGHBD_DECL_SUFFIX)
+{
+    const unsigned dc = dc_gen_left(topleft, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+/*
+ * Fixed-point reciprocals used by dc_gen() for non-square blocks:
+ * MULTIPLIER_1x2 / 2^BASE_SHIFT is approximately 1/3 and
+ * MULTIPLIER_1x4 / 2^BASE_SHIFT is approximately 1/5.
+ */
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+/*
+ * Rounded average of the top and left edge pixels together.
+ *
+ * The shift by ctz(width + height) removes the power-of-two factor of the
+ * pixel count. For non-square blocks the remaining factor is divided out
+ * with a multiply-and-shift: 1/5 when one side is more than twice the
+ * other, 1/3 otherwise.
+ * NOTE(review): this relies on power-of-two dimensions with aspect ratios
+ * of at most 1:4 - confirm against the supported transform sizes.
+ */
+static unsigned dc_gen(const pixel *const topleft,
+                       const int width, const int height)
+{
+    unsigned dc = (width + height) >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[i + 1];
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(i + 1)];
+    dc >>= ctz(width + height);
+
+    if (width != height) {
+        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+                                                           MULTIPLIER_1x2;
+        dc >>= BASE_SHIFT;
+    }
+    return dc;
+}
+
+/* DC prediction from both the top and the left edge. */
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft,
+                       const int width, const int height, const int a,
+                       const int max_width, const int max_height
+                       HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+/* Chroma-from-luma prediction with the DC taken from both edges. */
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+                        const pixel *const topleft,
+                        const int width, const int height,
+                        const int16_t *ac, const int alpha
+                        HIGHBD_DECL_SUFFIX)
+{
+    unsigned dc = dc_gen(topleft, width, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+/*
+ * DC prediction without any edge: fills the block with the mid-range value
+ * (128 for 8-bit, (bitdepth_max + 1) >> 1 otherwise).
+ */
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+    const int dc = (bitdepth_max + 1) >> 1;
+#else
+    const int dc = 128;
+#endif
+    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
+}
+
+/* Chroma-from-luma prediction around the mid-range value. */
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha
+                            HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+    const int dc = (bitdepth_max + 1) >> 1;
+#else
+    const int dc = 128;
+#endif
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+/* Vertical prediction: every row is a copy of the top edge. */
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a,
+                      const int max_width, const int max_height
+                      HIGHBD_DECL_SUFFIX)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_copy(dst, topleft + 1, width);
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a,
+                      const int
max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + for (int y = 0; y < height; y++) { + pixel_set(dst, topleft[-(1 + y)], width); + dst += PXSTRIDE(stride); + } +} + +static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride, + const pixel *const tl_ptr, + const int width, const int height, const int a, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int topleft = tl_ptr[0]; + for (int y = 0; y < height; y++) { + const int left = tl_ptr[-(y + 1)]; + for (int x = 0; x < width; x++) { + const int top = tl_ptr[1 + x]; + const int base = left + top - topleft; + const int ldiff = abs(left - base); + const int tdiff = abs(top - base); + const int tldiff = abs(topleft - base); + + dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left : + tdiff <= tldiff ? top : topleft; + } + dst += PXSTRIDE(stride); + } +} + +static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft, + const int width, const int height, const int a, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const uint8_t *const weights_hor = &dav1d_sm_weights[width]; + const uint8_t *const weights_ver = &dav1d_sm_weights[height]; + const int right = topleft[width], bottom = topleft[-height]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int pred = weights_ver[y] * topleft[1 + x] + + (256 - weights_ver[y]) * bottom + + weights_hor[x] * topleft[-(1 + y)] + + (256 - weights_hor[x]) * right; + dst[x] = (pred + 256) >> 9; + } + dst += PXSTRIDE(stride); + } +} + +static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft, + const int width, const int height, const int a, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const uint8_t *const weights_ver = &dav1d_sm_weights[height]; + const int bottom = topleft[-height]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int pred = weights_ver[y] * topleft[1 
+ x] + + (256 - weights_ver[y]) * bottom; + dst[x] = (pred + 128) >> 8; + } + dst += PXSTRIDE(stride); + } +} + +static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft, + const int width, const int height, const int a, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const uint8_t *const weights_hor = &dav1d_sm_weights[width]; + const int right = topleft[width]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const int pred = weights_hor[x] * topleft[-(y + 1)] + + (256 - weights_hor[x]) * right; + dst[x] = (pred + 128) >> 8; + } + dst += PXSTRIDE(stride); + } +} + +static NOINLINE int get_filter_strength(const int wh, const int angle, + const int is_sm) +{ + if (is_sm) { + if (wh <= 8) { + if (angle >= 64) return 2; + if (angle >= 40) return 1; + } else if (wh <= 16) { + if (angle >= 48) return 2; + if (angle >= 20) return 1; + } else if (wh <= 24) { + if (angle >= 4) return 3; + } else { + return 3; + } + } else { + if (wh <= 8) { + if (angle >= 56) return 1; + } else if (wh <= 16) { + if (angle >= 40) return 1; + } else if (wh <= 24) { + if (angle >= 32) return 3; + if (angle >= 16) return 2; + if (angle >= 8) return 1; + } else if (wh <= 32) { + if (angle >= 32) return 3; + if (angle >= 4) return 2; + return 1; + } else { + return 3; + } + } + return 0; +} + +static NOINLINE void filter_edge(pixel *const out, const int sz, + const int lim_from, const int lim_to, + const pixel *const in, const int from, + const int to, const int strength) +{ + static const uint8_t kernel[3][5] = { + { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } + }; + + assert(strength > 0); + int i = 0; + for (; i < imin(sz, lim_from); i++) + out[i] = in[iclip(i, from, to - 1)]; + for (; i < imin(lim_to, sz); i++) { + int s = 0; + for (int j = 0; j < 5; j++) + s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j]; + out[i] = (s + 8) >> 4; + } + for (; i < sz; i++) + out[i] = 
in[iclip(i, from, to - 1)]; +} + +static inline int get_upsample(const int wh, const int angle, const int is_sm) { + return angle < 40 && wh <= 16 >> is_sm; +} + +static NOINLINE void upsample_edge(pixel *const out, const int hsz, + const pixel *const in, const int from, + const int to HIGHBD_DECL_SUFFIX) +{ + static const int8_t kernel[4] = { -1, 9, 9, -1 }; + int i; + for (i = 0; i < hsz - 1; i++) { + out[i * 2] = in[iclip(i, from, to - 1)]; + + int s = 0; + for (int j = 0; j < 4; j++) + s += in[iclip(i + j - 1, from, to - 1)] * kernel[j]; + out[i * 2 + 1] = iclip_pixel((s + 8) >> 4); + } + out[i * 2] = in[iclip(i, from, to - 1)]; +} + +static void ipred_z1_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle < 90); + int dx = dav1d_dr_intra_derivative[angle >> 1]; + pixel top_out[64 + 64]; + const pixel *top; + int max_base_x; + const int upsample_above = enable_intra_edge_filter ? + get_upsample(width + height, 90 - angle, is_sm) : 0; + if (upsample_above) { + upsample_edge(top_out, width + height, &topleft_in[1], -1, + width + imin(width, height) HIGHBD_TAIL_SUFFIX); + top = top_out; + max_base_x = 2 * (width + height) - 2; + dx <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? 
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0; + if (filter_strength) { + filter_edge(top_out, width + height, 0, width + height, + &topleft_in[1], -1, width + imin(width, height), + filter_strength); + top = top_out; + max_base_x = width + height - 1; + } else { + top = &topleft_in[1]; + max_base_x = width + imin(width, height) - 1; + } + } + const int base_inc = 1 + upsample_above; + for (int y = 0, xpos = dx; y < height; + y++, dst += PXSTRIDE(stride), xpos += dx) + { + const int frac = xpos & 0x3E; + + for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) { + if (base < max_base_x) { + const int v = top[base] * (64 - frac) + top[base + 1] * frac; + dst[x] = (v + 32) >> 6; + } else { + pixel_set(&dst[x], top[max_base_x], width - x); + break; + } + } + } +} + +static void ipred_z2_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle > 90 && angle < 180); + int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1]; + int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1]; + const int upsample_left = enable_intra_edge_filter ? + get_upsample(width + height, 180 - angle, is_sm) : 0; + const int upsample_above = enable_intra_edge_filter ? + get_upsample(width + height, angle - 90, is_sm) : 0; + pixel edge[64 + 64 + 1]; + pixel *const topleft = &edge[64]; + + if (upsample_above) { + upsample_edge(topleft, width + 1, topleft_in, 0, width + 1 + HIGHBD_TAIL_SUFFIX); + dx <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? 
+ get_filter_strength(width + height, angle - 90, is_sm) : 0; + + if (filter_strength) { + filter_edge(&topleft[1], width, 0, max_width, + &topleft_in[1], -1, width, + filter_strength); + } else { + pixel_copy(&topleft[1], &topleft_in[1], width); + } + } + if (upsample_left) { + upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height], + 0, height + 1 HIGHBD_TAIL_SUFFIX); + dy <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, 180 - angle, is_sm) : 0; + + if (filter_strength) { + filter_edge(&topleft[-height], height, height - max_height, height, + &topleft_in[-height], + 0, height + 1, filter_strength); + } else { + pixel_copy(&topleft[-height], &topleft_in[-height], height); + } + } + *topleft = *topleft_in; + + const int base_inc_x = 1 + upsample_above; + const pixel *const left = &topleft[-(1 + upsample_left)]; + for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height; + y++, xpos -= dx, dst += PXSTRIDE(stride)) + { + int base_x = xpos >> 6; + const int frac_x = xpos & 0x3E; + + for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width; + x++, base_x += base_inc_x, ypos -= dy) + { + int v; + if (base_x >= 0) { + v = topleft[base_x] * (64 - frac_x) + + topleft[base_x + 1] * frac_x; + } else { + const int base_y = ypos >> 6; + assert(base_y >= -(1 + upsample_left)); + const int frac_y = ypos & 0x3E; + v = left[-base_y] * (64 - frac_y) + + left[-(base_y + 1)] * frac_y; + } + dst[x] = (v + 32) >> 6; + } + } +} + +static void ipred_z3_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle > 180); + int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1]; + pixel left_out[64 + 64]; + const pixel *left; + int max_base_y; + 
const int upsample_left = enable_intra_edge_filter ? + get_upsample(width + height, angle - 180, is_sm) : 0; + if (upsample_left) { + upsample_edge(left_out, width + height, + &topleft_in[-(width + height)], + imax(width - height, 0), width + height + 1 + HIGHBD_TAIL_SUFFIX); + left = &left_out[2 * (width + height) - 2]; + max_base_y = 2 * (width + height) - 2; + dy <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, angle - 180, is_sm) : 0; + + if (filter_strength) { + filter_edge(left_out, width + height, 0, width + height, + &topleft_in[-(width + height)], + imax(width - height, 0), width + height + 1, + filter_strength); + left = &left_out[width + height - 1]; + max_base_y = width + height - 1; + } else { + left = &topleft_in[-1]; + max_base_y = height + imin(width, height) - 1; + } + } + const int base_inc = 1 + upsample_left; + for (int x = 0, ypos = dy; x < width; x++, ypos += dy) { + const int frac = ypos & 0x3E; + + for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) { + if (base < max_base_y) { + const int v = left[-base] * (64 - frac) + + left[-(base + 1)] * frac; + dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6; + } else { + do { + dst[y * PXSTRIDE(stride) + x] = left[-max_base_y]; + } while (++y < height); + break; + } + } + } +} + +#if ARCH_X86 +#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \ + flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \ + flt_ptr[16] * p2 + flt_ptr[17] * p3 + \ + flt_ptr[32] * p4 + flt_ptr[33] * p5 + \ + flt_ptr[48] * p6 +#define FLT_INCR 2 +#else +#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \ + flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \ + flt_ptr[16] * p2 + flt_ptr[24] * p3 + \ + flt_ptr[32] * p4 + flt_ptr[40] * p5 + \ + flt_ptr[48] * p6 +#define FLT_INCR 1 +#endif + +/* Up to 32x32 only */ +static void ipred_filter_c(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int filt_idx, + const int 
max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + filt_idx &= 511; + assert(filt_idx < 5); + + const int8_t *const filter = dav1d_filter_intra_taps[filt_idx]; + const pixel *top = &topleft_in[1]; + for (int y = 0; y < height; y += 2) { + const pixel *topleft = &topleft_in[-y]; + const pixel *left = &topleft[-1]; + ptrdiff_t left_stride = -1; + for (int x = 0; x < width; x += 4) { + const int p0 = *topleft; + const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3]; + const int p5 = left[0 * left_stride], p6 = left[1 * left_stride]; + pixel *ptr = &dst[x]; + const int8_t *flt_ptr = filter; + + for (int yy = 0; yy < 2; yy++) { + for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) { + const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6); + ptr[xx] = iclip_pixel((acc + 8) >> 4); + } + ptr += PXSTRIDE(stride); + } + left = &dst[x + 4 - 1]; + left_stride = PXSTRIDE(stride); + top += 4; + topleft = &top[-1]; + } + top = &dst[PXSTRIDE(stride)]; + dst = &dst[PXSTRIDE(stride) * 2]; + } +} + +static NOINLINE void +cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride, + const int w_pad, const int h_pad, const int width, const int height, + const int ss_hor, const int ss_ver) +{ + int y, x; + int16_t *const ac_orig = ac; + + assert(w_pad >= 0 && w_pad * 4 < width); + assert(h_pad >= 0 && h_pad * 4 < height); + + for (y = 0; y < height - 4 * h_pad; y++) { + for (x = 0; x < width - 4 * w_pad; x++) { + int ac_sum = ypx[x << ss_hor]; + if (ss_hor) ac_sum += ypx[x * 2 + 1]; + if (ss_ver) { + ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)]; + if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)]; + } + ac[x] = ac_sum << (1 + !ss_ver + !ss_hor); + } + for (; x < width; x++) + ac[x] = ac[x - 1]; + ac += width; + ypx += PXSTRIDE(stride) << ss_ver; + } + for (; y < height; y++) { + memcpy(ac, &ac[-width], width * sizeof(*ac)); + ac += width; + } + + const int log2sz = ctz(width) + ctz(height); + int sum = (1 << log2sz) >> 1; + for (ac = ac_orig, y = 0; y < 
height; y++) { + for (x = 0; x < width; x++) + sum += ac[x]; + ac += width; + } + sum >>= log2sz; + + // subtract DC + for (ac = ac_orig, y = 0; y < height; y++) { + for (x = 0; x < width; x++) + ac[x] -= sum; + ac += width; + } +} + +#define cfl_ac_fn(fmt, ss_hor, ss_ver) \ +static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \ + const ptrdiff_t stride, const int w_pad, \ + const int h_pad, const int cw, const int ch) \ +{ \ + cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \ +} + +cfl_ac_fn(420, 1, 1) +cfl_ac_fn(422, 1, 0) +cfl_ac_fn(444, 0, 0) + +static void pal_pred_c(pixel *dst, const ptrdiff_t stride, + const uint16_t *const pal, const uint8_t *idx, + const int w, const int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) + dst[x] = (pixel) pal[idx[x]]; + idx += w; + dst += PXSTRIDE(stride); + } +} + +COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { + c->intra_pred[DC_PRED ] = ipred_dc_c; + c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; + c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c; + c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c; + c->intra_pred[HOR_PRED ] = ipred_h_c; + c->intra_pred[VERT_PRED ] = ipred_v_c; + c->intra_pred[PAETH_PRED ] = ipred_paeth_c; + c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c; + c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c; + c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c; + c->intra_pred[Z1_PRED ] = ipred_z1_c; + c->intra_pred[Z2_PRED ] = ipred_z2_c; + c->intra_pred[Z3_PRED ] = ipred_z3_c; + c->intra_pred[FILTER_PRED ] = ipred_filter_c; + + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c; + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c; + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c; + + c->cfl_pred[DC_PRED ] = ipred_cfl_c; + c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c; + c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c; + c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c; + + c->pal_pred = pal_pred_c; + +#if HAVE_ASM +#if 
ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_intra_pred_dsp_init_arm)(c); +#elif ARCH_X86 + bitfn(dav1d_intra_pred_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h new file mode 100644 index 0000000000..a299629c5c --- /dev/null +++ b/third_party/dav1d/src/itx.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_ITX_H +#define DAV1D_SRC_ITX_H + +#include + +#include "common/bitdepth.h" + +#include "src/levels.h" + +#define decl_itx_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \ + HIGHBD_DECL_SUFFIX) +typedef decl_itx_fn(*itxfm_fn); + +typedef struct Dav1dInvTxfmDSPContext { + itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL]; +} Dav1dInvTxfmDSPContext; + +bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); +bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); +bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c); + +#endif /* DAV1D_SRC_ITX_H */ diff --git a/third_party/dav1d/src/itx_1d.c b/third_party/dav1d/src/itx_1d.c new file mode 100644 index 0000000000..ca14fc8c41 --- /dev/null +++ b/third_party/dav1d/src/itx_1d.c @@ -0,0 +1,1034 @@ +/* + * Copyright © 2018-2019, VideoLAN and dav1d authors + * Copyright © 2018-2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include + +#include "common/intops.h" + +#include "src/itx_1d.h" + +#define CLIP(a) iclip(a, min, max) + +/* + * In some places, we use the pattern like this: + * t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3; + * even though the reference code might use something like: + * t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12; + * + * The reason for this is that for 12 bits/component bitstreams (corrupt/ + * invalid ones, but they are codable nonetheless), each coefficient or + * input can be 19(+sign) bits, and therefore if the combination of the + * two multipliers (each 12 bits) is >= 4096, the result of the add/sub + * after the pair of multiplies will exceed the 31+sign bit range. Signed + * integer overflows are UB in C, and we'd like to prevent that. + * + * To workaround this, we invert one of the two coefficients (or, if both are + * multiples of 2, we reduce their magnitude by one bit). It should be noted + * that SIMD implementations do not have to follow this exact behaviour. The + * AV1 spec clearly states that the result of the multiply/add pairs should + * fit in 31+sign bit intermediates, and that streams violating this convention + * are not AV1-compliant. So, as long as we don't trigger UB (which some people + * would consider a security vulnerability), we're fine. 
So, SIMD can simply + * use the faster implementation, even if that might in some cases result in + * integer overflows, since these are not considered valid AV1 anyway, and in + * e.g. x86 assembly, integer overflows are not considered UB, but they merely + * wrap around. + */ + +static NOINLINE void +inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max, const int tx64) +{ + assert(stride > 0); + const int in0 = c[0 * stride], in1 = c[1 * stride]; + + int t0, t1, t2, t3; + if (tx64) { + t0 = t1 = (in0 * 181 + 128) >> 8; + t2 = (in1 * 1567 + 2048) >> 12; + t3 = (in1 * 3784 + 2048) >> 12; + } else { + const int in2 = c[2 * stride], in3 = c[3 * stride]; + + t0 = ((in0 + in2) * 181 + 128) >> 8; + t1 = ((in0 - in2) * 181 + 128) >> 8; + t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3; + t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1; + } + + c[0 * stride] = CLIP(t0 + t3); + c[1 * stride] = CLIP(t1 + t2); + c[2 * stride] = CLIP(t1 - t2); + c[3 * stride] = CLIP(t0 - t3); +} + +void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) +{ + inv_dct4_1d_internal_c(c, stride, min, max, 0); +} + +static NOINLINE void +inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max, const int tx64) +{ + assert(stride > 0); + inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64); + + const int in1 = c[1 * stride], in3 = c[3 * stride]; + + int t4a, t5a, t6a, t7a; + if (tx64) { + t4a = (in1 * 799 + 2048) >> 12; + t5a = (in3 * -2276 + 2048) >> 12; + t6a = (in3 * 3406 + 2048) >> 12; + t7a = (in1 * 4017 + 2048) >> 12; + } else { + const int in5 = c[5 * stride], in7 = c[7 * stride]; + + t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7; + t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11; + t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11; + t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1; + } + + const int t4 = CLIP(t4a + t5a); 
+ t5a = CLIP(t4a - t5a); + const int t7 = CLIP(t7a + t6a); + t6a = CLIP(t7a - t6a); + + const int t5 = ((t6a - t5a) * 181 + 128) >> 8; + const int t6 = ((t6a + t5a) * 181 + 128) >> 8; + + const int t0 = c[0 * stride]; + const int t1 = c[2 * stride]; + const int t2 = c[4 * stride]; + const int t3 = c[6 * stride]; + + c[0 * stride] = CLIP(t0 + t7); + c[1 * stride] = CLIP(t1 + t6); + c[2 * stride] = CLIP(t2 + t5); + c[3 * stride] = CLIP(t3 + t4); + c[4 * stride] = CLIP(t3 - t4); + c[5 * stride] = CLIP(t2 - t5); + c[6 * stride] = CLIP(t1 - t6); + c[7 * stride] = CLIP(t0 - t7); +} + +void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) +{ + inv_dct8_1d_internal_c(c, stride, min, max, 0); +} + +static NOINLINE void +inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max, int tx64) +{ + assert(stride > 0); + inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64); + + const int in1 = c[1 * stride], in3 = c[3 * stride]; + const int in5 = c[5 * stride], in7 = c[7 * stride]; + + int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; + if (tx64) { + t8a = (in1 * 401 + 2048) >> 12; + t9a = (in7 * -2598 + 2048) >> 12; + t10a = (in5 * 1931 + 2048) >> 12; + t11a = (in3 * -1189 + 2048) >> 12; + t12a = (in3 * 3920 + 2048) >> 12; + t13a = (in5 * 3612 + 2048) >> 12; + t14a = (in7 * 3166 + 2048) >> 12; + t15a = (in1 * 4076 + 2048) >> 12; + } else { + const int in9 = c[ 9 * stride], in11 = c[11 * stride]; + const int in13 = c[13 * stride], in15 = c[15 * stride]; + + t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15; + t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11; + t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11; + t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13; + t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3; + t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5; + t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11; + t15a = ((in1 * 
(4076 - 4096) + in15 * 401 + 2048) >> 12) + in1; + } + + int t8 = CLIP(t8a + t9a); + int t9 = CLIP(t8a - t9a); + int t10 = CLIP(t11a - t10a); + int t11 = CLIP(t11a + t10a); + int t12 = CLIP(t12a + t13a); + int t13 = CLIP(t12a - t13a); + int t14 = CLIP(t15a - t14a); + int t15 = CLIP(t15a + t14a); + + t9a = (( t14 * 1567 - t9 * (3784 - 4096) + 2048) >> 12) - t9; + t14a = (( t14 * (3784 - 4096) + t9 * 1567 + 2048) >> 12) + t14; + t10a = ((-(t13 * (3784 - 4096) + t10 * 1567) + 2048) >> 12) - t13; + t13a = (( t13 * 1567 - t10 * (3784 - 4096) + 2048) >> 12) - t10; + + t8a = CLIP(t8 + t11); + t9 = CLIP(t9a + t10a); + t10 = CLIP(t9a - t10a); + t11a = CLIP(t8 - t11); + t12a = CLIP(t15 - t12); + t13 = CLIP(t14a - t13a); + t14 = CLIP(t14a + t13a); + t15a = CLIP(t15 + t12); + + t10a = ((t13 - t10) * 181 + 128) >> 8; + t13a = ((t13 + t10) * 181 + 128) >> 8; + t11 = ((t12a - t11a) * 181 + 128) >> 8; + t12 = ((t12a + t11a) * 181 + 128) >> 8; + + const int t0 = c[ 0 * stride]; + const int t1 = c[ 2 * stride]; + const int t2 = c[ 4 * stride]; + const int t3 = c[ 6 * stride]; + const int t4 = c[ 8 * stride]; + const int t5 = c[10 * stride]; + const int t6 = c[12 * stride]; + const int t7 = c[14 * stride]; + + c[ 0 * stride] = CLIP(t0 + t15a); + c[ 1 * stride] = CLIP(t1 + t14); + c[ 2 * stride] = CLIP(t2 + t13a); + c[ 3 * stride] = CLIP(t3 + t12); + c[ 4 * stride] = CLIP(t4 + t11); + c[ 5 * stride] = CLIP(t5 + t10a); + c[ 6 * stride] = CLIP(t6 + t9); + c[ 7 * stride] = CLIP(t7 + t8a); + c[ 8 * stride] = CLIP(t7 - t8a); + c[ 9 * stride] = CLIP(t6 - t9); + c[10 * stride] = CLIP(t5 - t10a); + c[11 * stride] = CLIP(t4 - t11); + c[12 * stride] = CLIP(t3 - t12); + c[13 * stride] = CLIP(t2 - t13a); + c[14 * stride] = CLIP(t1 - t14); + c[15 * stride] = CLIP(t0 - t15a); +} + +void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) +{ + inv_dct16_1d_internal_c(c, stride, min, max, 0); +} + +static NOINLINE void +inv_dct32_1d_internal_c(int32_t *const 
c, const ptrdiff_t stride, + const int min, const int max, const int tx64) +{ + assert(stride > 0); + inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64); + + const int in1 = c[ 1 * stride], in3 = c[ 3 * stride]; + const int in5 = c[ 5 * stride], in7 = c[ 7 * stride]; + const int in9 = c[ 9 * stride], in11 = c[11 * stride]; + const int in13 = c[13 * stride], in15 = c[15 * stride]; + + int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a; + int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a; + if (tx64) { + t16a = (in1 * 201 + 2048) >> 12; + t17a = (in15 * -2751 + 2048) >> 12; + t18a = (in9 * 1751 + 2048) >> 12; + t19a = (in7 * -1380 + 2048) >> 12; + t20a = (in5 * 995 + 2048) >> 12; + t21a = (in11 * -2106 + 2048) >> 12; + t22a = (in13 * 2440 + 2048) >> 12; + t23a = (in3 * -601 + 2048) >> 12; + t24a = (in3 * 4052 + 2048) >> 12; + t25a = (in13 * 3290 + 2048) >> 12; + t26a = (in11 * 3513 + 2048) >> 12; + t27a = (in5 * 3973 + 2048) >> 12; + t28a = (in7 * 3857 + 2048) >> 12; + t29a = (in9 * 3703 + 2048) >> 12; + t30a = (in15 * 3035 + 2048) >> 12; + t31a = (in1 * 4091 + 2048) >> 12; + } else { + const int in17 = c[17 * stride], in19 = c[19 * stride]; + const int in21 = c[21 * stride], in23 = c[23 * stride]; + const int in25 = c[25 * stride], in27 = c[27 * stride]; + const int in29 = c[29 * stride], in31 = c[31 * stride]; + + t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31; + t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17; + t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23; + t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25; + t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27; + t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21; + t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11; + t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29; + t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3; + t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11; + t26a = ((in21 
* 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11; + t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5; + t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7; + t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9; + t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15; + t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1; + } + + int t16 = CLIP(t16a + t17a); + int t17 = CLIP(t16a - t17a); + int t18 = CLIP(t19a - t18a); + int t19 = CLIP(t19a + t18a); + int t20 = CLIP(t20a + t21a); + int t21 = CLIP(t20a - t21a); + int t22 = CLIP(t23a - t22a); + int t23 = CLIP(t23a + t22a); + int t24 = CLIP(t24a + t25a); + int t25 = CLIP(t24a - t25a); + int t26 = CLIP(t27a - t26a); + int t27 = CLIP(t27a + t26a); + int t28 = CLIP(t28a + t29a); + int t29 = CLIP(t28a - t29a); + int t30 = CLIP(t31a - t30a); + int t31 = CLIP(t31a + t30a); + + t17a = (( t30 * 799 - t17 * (4017 - 4096) + 2048) >> 12) - t17; + t30a = (( t30 * (4017 - 4096) + t17 * 799 + 2048) >> 12) + t30; + t18a = ((-(t29 * (4017 - 4096) + t18 * 799) + 2048) >> 12) - t29; + t29a = (( t29 * 799 - t18 * (4017 - 4096) + 2048) >> 12) - t18; + t21a = ( t26 * 1703 - t21 * 1138 + 1024) >> 11; + t26a = ( t26 * 1138 + t21 * 1703 + 1024) >> 11; + t22a = (-(t25 * 1138 + t22 * 1703 ) + 1024) >> 11; + t25a = ( t25 * 1703 - t22 * 1138 + 1024) >> 11; + + t16a = CLIP(t16 + t19); + t17 = CLIP(t17a + t18a); + t18 = CLIP(t17a - t18a); + t19a = CLIP(t16 - t19); + t20a = CLIP(t23 - t20); + t21 = CLIP(t22a - t21a); + t22 = CLIP(t22a + t21a); + t23a = CLIP(t23 + t20); + t24a = CLIP(t24 + t27); + t25 = CLIP(t25a + t26a); + t26 = CLIP(t25a - t26a); + t27a = CLIP(t24 - t27); + t28a = CLIP(t31 - t28); + t29 = CLIP(t30a - t29a); + t30 = CLIP(t30a + t29a); + t31a = CLIP(t31 + t28); + + t18a = (( t29 * 1567 - t18 * (3784 - 4096) + 2048) >> 12) - t18; + t29a = (( t29 * (3784 - 4096) + t18 * 1567 + 2048) >> 12) + t29; + t19 = (( t28a * 1567 - t19a * (3784 - 4096) + 2048) >> 12) - t19a; + t28 = 
(( t28a * (3784 - 4096) + t19a * 1567 + 2048) >> 12) + t28a; + t20 = ((-(t27a * (3784 - 4096) + t20a * 1567) + 2048) >> 12) - t27a; + t27 = (( t27a * 1567 - t20a * (3784 - 4096) + 2048) >> 12) - t20a; + t21a = ((-(t26 * (3784 - 4096) + t21 * 1567) + 2048) >> 12) - t26; + t26a = (( t26 * 1567 - t21 * (3784 - 4096) + 2048) >> 12) - t21; + + t16 = CLIP(t16a + t23a); + t17a = CLIP(t17 + t22); + t18 = CLIP(t18a + t21a); + t19a = CLIP(t19 + t20); + t20a = CLIP(t19 - t20); + t21 = CLIP(t18a - t21a); + t22a = CLIP(t17 - t22); + t23 = CLIP(t16a - t23a); + t24 = CLIP(t31a - t24a); + t25a = CLIP(t30 - t25); + t26 = CLIP(t29a - t26a); + t27a = CLIP(t28 - t27); + t28a = CLIP(t28 + t27); + t29 = CLIP(t29a + t26a); + t30a = CLIP(t30 + t25); + t31 = CLIP(t31a + t24a); + + t20 = ((t27a - t20a) * 181 + 128) >> 8; + t27 = ((t27a + t20a) * 181 + 128) >> 8; + t21a = ((t26 - t21 ) * 181 + 128) >> 8; + t26a = ((t26 + t21 ) * 181 + 128) >> 8; + t22 = ((t25a - t22a) * 181 + 128) >> 8; + t25 = ((t25a + t22a) * 181 + 128) >> 8; + t23a = ((t24 - t23 ) * 181 + 128) >> 8; + t24a = ((t24 + t23 ) * 181 + 128) >> 8; + + const int t0 = c[ 0 * stride]; + const int t1 = c[ 2 * stride]; + const int t2 = c[ 4 * stride]; + const int t3 = c[ 6 * stride]; + const int t4 = c[ 8 * stride]; + const int t5 = c[10 * stride]; + const int t6 = c[12 * stride]; + const int t7 = c[14 * stride]; + const int t8 = c[16 * stride]; + const int t9 = c[18 * stride]; + const int t10 = c[20 * stride]; + const int t11 = c[22 * stride]; + const int t12 = c[24 * stride]; + const int t13 = c[26 * stride]; + const int t14 = c[28 * stride]; + const int t15 = c[30 * stride]; + + c[ 0 * stride] = CLIP(t0 + t31); + c[ 1 * stride] = CLIP(t1 + t30a); + c[ 2 * stride] = CLIP(t2 + t29); + c[ 3 * stride] = CLIP(t3 + t28a); + c[ 4 * stride] = CLIP(t4 + t27); + c[ 5 * stride] = CLIP(t5 + t26a); + c[ 6 * stride] = CLIP(t6 + t25); + c[ 7 * stride] = CLIP(t7 + t24a); + c[ 8 * stride] = CLIP(t8 + t23a); + c[ 9 * stride] = CLIP(t9 + t22); + 
/*
 * In-place 32-point inverse DCT over the coefficients c[0], c[stride], ...,
 * c[31 * stride].
 *
 * Public entry point for the shared 32-point kernel. The kernel's last
 * argument (tx64) is passed as 0, which selects the branch that reads all 16
 * odd-indexed inputs rather than the reduced set used when the kernel is
 * run as a stage of the 64-point transform.
 *
 * min/max are forwarded unchanged to the kernel (presumably the bounds used
 * by its CLIP() steps -- the macro is defined outside this excerpt).
 */
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    enum { TX64_DISABLED = 0 };

    inv_dct32_1d_internal_c(c, stride, min, max, TX64_DISABLED);
}
2048) >> 12; + int t42a = (in21 * 2019 + 2048) >> 12; + int t43a = (in11 * -1092 + 2048) >> 12; + int t44a = (in13 * 1285 + 2048) >> 12; + int t45a = (in19 * -1842 + 2048) >> 12; + int t46a = (in29 * 2675 + 2048) >> 12; + int t47a = (in3 * -301 + 2048) >> 12; + int t48a = (in3 * 4085 + 2048) >> 12; + int t49a = (in29 * 3102 + 2048) >> 12; + int t50a = (in19 * 3659 + 2048) >> 12; + int t51a = (in13 * 3889 + 2048) >> 12; + int t52a = (in11 * 3948 + 2048) >> 12; + int t53a = (in21 * 3564 + 2048) >> 12; + int t54a = (in27 * 3229 + 2048) >> 12; + int t55a = (in5 * 4065 + 2048) >> 12; + int t56a = (in7 * 4036 + 2048) >> 12; + int t57a = (in25 * 3349 + 2048) >> 12; + int t58a = (in23 * 3461 + 2048) >> 12; + int t59a = (in9 * 3996 + 2048) >> 12; + int t60a = (in15 * 3822 + 2048) >> 12; + int t61a = (in17 * 3745 + 2048) >> 12; + int t62a = (in31 * 2967 + 2048) >> 12; + int t63a = (in1 * 4095 + 2048) >> 12; + + int t32 = CLIP(t32a + t33a); + int t33 = CLIP(t32a - t33a); + int t34 = CLIP(t35a - t34a); + int t35 = CLIP(t35a + t34a); + int t36 = CLIP(t36a + t37a); + int t37 = CLIP(t36a - t37a); + int t38 = CLIP(t39a - t38a); + int t39 = CLIP(t39a + t38a); + int t40 = CLIP(t40a + t41a); + int t41 = CLIP(t40a - t41a); + int t42 = CLIP(t43a - t42a); + int t43 = CLIP(t43a + t42a); + int t44 = CLIP(t44a + t45a); + int t45 = CLIP(t44a - t45a); + int t46 = CLIP(t47a - t46a); + int t47 = CLIP(t47a + t46a); + int t48 = CLIP(t48a + t49a); + int t49 = CLIP(t48a - t49a); + int t50 = CLIP(t51a - t50a); + int t51 = CLIP(t51a + t50a); + int t52 = CLIP(t52a + t53a); + int t53 = CLIP(t52a - t53a); + int t54 = CLIP(t55a - t54a); + int t55 = CLIP(t55a + t54a); + int t56 = CLIP(t56a + t57a); + int t57 = CLIP(t56a - t57a); + int t58 = CLIP(t59a - t58a); + int t59 = CLIP(t59a + t58a); + int t60 = CLIP(t60a + t61a); + int t61 = CLIP(t60a - t61a); + int t62 = CLIP(t63a - t62a); + int t63 = CLIP(t63a + t62a); + + t33a = ((t33 * (4096 - 4076) + t62 * 401 + 2048) >> 12) - t33; + t34a = ((t34 * -401 + t61 
* (4096 - 4076) + 2048) >> 12) - t61; + t37a = (t37 * -1299 + t58 * 1583 + 1024) >> 11; + t38a = (t38 * -1583 + t57 * -1299 + 1024) >> 11; + t41a = ((t41 * (4096 - 3612) + t54 * 1931 + 2048) >> 12) - t41; + t42a = ((t42 * -1931 + t53 * (4096 - 3612) + 2048) >> 12) - t53; + t45a = ((t45 * -1189 + t50 * (3920 - 4096) + 2048) >> 12) + t50; + t46a = ((t46 * (4096 - 3920) + t49 * -1189 + 2048) >> 12) - t46; + t49a = ((t46 * -1189 + t49 * (3920 - 4096) + 2048) >> 12) + t49; + t50a = ((t45 * (3920 - 4096) + t50 * 1189 + 2048) >> 12) + t45; + t53a = ((t42 * (4096 - 3612) + t53 * 1931 + 2048) >> 12) - t42; + t54a = ((t41 * 1931 + t54 * (3612 - 4096) + 2048) >> 12) + t54; + t57a = (t38 * -1299 + t57 * 1583 + 1024) >> 11; + t58a = (t37 * 1583 + t58 * 1299 + 1024) >> 11; + t61a = ((t34 * (4096 - 4076) + t61 * 401 + 2048) >> 12) - t34; + t62a = ((t33 * 401 + t62 * (4076 - 4096) + 2048) >> 12) + t62; + + t32a = CLIP(t32 + t35); + t33 = CLIP(t33a + t34a); + t34 = CLIP(t33a - t34a); + t35a = CLIP(t32 - t35); + t36a = CLIP(t39 - t36); + t37 = CLIP(t38a - t37a); + t38 = CLIP(t38a + t37a); + t39a = CLIP(t39 + t36); + t40a = CLIP(t40 + t43); + t41 = CLIP(t41a + t42a); + t42 = CLIP(t41a - t42a); + t43a = CLIP(t40 - t43); + t44a = CLIP(t47 - t44); + t45 = CLIP(t46a - t45a); + t46 = CLIP(t46a + t45a); + t47a = CLIP(t47 + t44); + t48a = CLIP(t48 + t51); + t49 = CLIP(t49a + t50a); + t50 = CLIP(t49a - t50a); + t51a = CLIP(t48 - t51); + t52a = CLIP(t55 - t52); + t53 = CLIP(t54a - t53a); + t54 = CLIP(t54a + t53a); + t55a = CLIP(t55 + t52); + t56a = CLIP(t56 + t59); + t57 = CLIP(t57a + t58a); + t58 = CLIP(t57a - t58a); + t59a = CLIP(t56 - t59); + t60a = CLIP(t63 - t60); + t61 = CLIP(t62a - t61a); + t62 = CLIP(t62a + t61a); + t63a = CLIP(t63 + t60); + + t34a = ((t34 * (4096 - 4017) + t61 * 799 + 2048) >> 12) - t34; + t35 = ((t35a * (4096 - 4017) + t60a * 799 + 2048) >> 12) - t35a; + t36 = ((t36a * -799 + t59a * (4096 - 4017) + 2048) >> 12) - t59a; + t37a = ((t37 * -799 + t58 * (4096 - 4017) + 
2048) >> 12) - t58; + t42a = (t42 * -1138 + t53 * 1703 + 1024) >> 11; + t43 = (t43a * -1138 + t52a * 1703 + 1024) >> 11; + t44 = (t44a * -1703 + t51a * -1138 + 1024) >> 11; + t45a = (t45 * -1703 + t50 * -1138 + 1024) >> 11; + t50a = (t45 * -1138 + t50 * 1703 + 1024) >> 11; + t51 = (t44a * -1138 + t51a * 1703 + 1024) >> 11; + t52 = (t43a * 1703 + t52a * 1138 + 1024) >> 11; + t53a = (t42 * 1703 + t53 * 1138 + 1024) >> 11; + t58a = ((t37 * (4096 - 4017) + t58 * 799 + 2048) >> 12) - t37; + t59 = ((t36a * (4096 - 4017) + t59a * 799 + 2048) >> 12) - t36a; + t60 = ((t35a * 799 + t60a * (4017 - 4096) + 2048) >> 12) + t60a; + t61a = ((t34 * 799 + t61 * (4017 - 4096) + 2048) >> 12) + t61; + + t32 = CLIP(t32a + t39a); + t33a = CLIP(t33 + t38); + t34 = CLIP(t34a + t37a); + t35a = CLIP(t35 + t36); + t36a = CLIP(t35 - t36); + t37 = CLIP(t34a - t37a); + t38a = CLIP(t33 - t38); + t39 = CLIP(t32a - t39a); + t40 = CLIP(t47a - t40a); + t41a = CLIP(t46 - t41); + t42 = CLIP(t45a - t42a); + t43a = CLIP(t44 - t43); + t44a = CLIP(t44 + t43); + t45 = CLIP(t45a + t42a); + t46a = CLIP(t46 + t41); + t47 = CLIP(t47a + t40a); + t48 = CLIP(t48a + t55a); + t49a = CLIP(t49 + t54); + t50 = CLIP(t50a + t53a); + t51a = CLIP(t51 + t52); + t52a = CLIP(t51 - t52); + t53 = CLIP(t50a - t53a); + t54a = CLIP(t49 - t54); + t55 = CLIP(t48a - t55a); + t56 = CLIP(t63a - t56a); + t57a = CLIP(t62 - t57); + t58 = CLIP(t61a - t58a); + t59a = CLIP(t60 - t59); + t60a = CLIP(t60 + t59); + t61 = CLIP(t61a + t58a); + t62a = CLIP(t62 + t57); + t63 = CLIP(t63a + t56a); + + t36 = ((t36a * (4096 - 3784) + t59a * 1567 + 2048) >> 12) - t36a; + t37a = ((t37 * (4096 - 3784) + t58 * 1567 + 2048) >> 12) - t37; + t38 = ((t38a * (4096 - 3784) + t57a * 1567 + 2048) >> 12) - t38a; + t39a = ((t39 * (4096 - 3784) + t56 * 1567 + 2048) >> 12) - t39; + t40a = ((t40 * -1567 + t55 * (4096 - 3784) + 2048) >> 12) - t55; + t41 = ((t41a * -1567 + t54a * (4096 - 3784) + 2048) >> 12) - t54a; + t42a = ((t42 * -1567 + t53 * (4096 - 3784) + 2048) >> 
12) - t53; + t43 = ((t43a * -1567 + t52a * (4096 - 3784) + 2048) >> 12) - t52a; + t52 = ((t43a * (4096 - 3784) + t52a * 1567 + 2048) >> 12) - t43a; + t53a = ((t42 * (4096 - 3784) + t53 * 1567 + 2048) >> 12) - t42; + t54 = ((t41a * (4096 - 3784) + t54a * 1567 + 2048) >> 12) - t41a; + t55a = ((t40 * (4096 - 3784) + t55 * 1567 + 2048) >> 12) - t40; + t56a = ((t39 * 1567 + t56 * (3784 - 4096) + 2048) >> 12) + t56; + t57 = ((t38a * 1567 + t57a * (3784 - 4096) + 2048) >> 12) + t57a; + t58a = ((t37 * 1567 + t58 * (3784 - 4096) + 2048) >> 12) + t58; + t59 = ((t36a * 1567 + t59a * (3784 - 4096) + 2048) >> 12) + t59a; + + t32a = CLIP(t32 + t47); + t33 = CLIP(t33a + t46a); + t34a = CLIP(t34 + t45); + t35 = CLIP(t35a + t44a); + t36a = CLIP(t36 + t43); + t37 = CLIP(t37a + t42a); + t38a = CLIP(t38 + t41); + t39 = CLIP(t39a + t40a); + t40 = CLIP(t39a - t40a); + t41a = CLIP(t38 - t41); + t42 = CLIP(t37a - t42a); + t43a = CLIP(t36 - t43); + t44 = CLIP(t35a - t44a); + t45a = CLIP(t34 - t45); + t46 = CLIP(t33a - t46a); + t47a = CLIP(t32 - t47); + t48a = CLIP(t63 - t48); + t49 = CLIP(t62a - t49a); + t50a = CLIP(t61 - t50); + t51 = CLIP(t60a - t51a); + t52a = CLIP(t59 - t52); + t53 = CLIP(t58a - t53a); + t54a = CLIP(t57 - t54); + t55 = CLIP(t56a - t55a); + t56 = CLIP(t56a + t55a); + t57a = CLIP(t57 + t54); + t58 = CLIP(t58a + t53a); + t59a = CLIP(t59 + t52); + t60 = CLIP(t60a + t51a); + t61a = CLIP(t61 + t50); + t62 = CLIP(t62a + t49a); + t63a = CLIP(t63 + t48); + + t40a = ((t55 - t40 ) * 181 + 128) >> 8; + t41 = ((t54a - t41a) * 181 + 128) >> 8; + t42a = ((t53 - t42 ) * 181 + 128) >> 8; + t43 = ((t52a - t43a) * 181 + 128) >> 8; + t44a = ((t51 - t44 ) * 181 + 128) >> 8; + t45 = ((t50a - t45a) * 181 + 128) >> 8; + t46a = ((t49 - t46 ) * 181 + 128) >> 8; + t47 = ((t48a - t47a) * 181 + 128) >> 8; + t48 = ((t47a + t48a) * 181 + 128) >> 8; + t49a = ((t46 + t49 ) * 181 + 128) >> 8; + t50 = ((t45a + t50a) * 181 + 128) >> 8; + t51a = ((t44 + t51 ) * 181 + 128) >> 8; + t52 = ((t43a + t52a) * 
181 + 128) >> 8; + t53a = ((t42 + t53 ) * 181 + 128) >> 8; + t54 = ((t41a + t54a) * 181 + 128) >> 8; + t55a = ((t40 + t55 ) * 181 + 128) >> 8; + + const int t0 = c[ 0 * stride]; + const int t1 = c[ 2 * stride]; + const int t2 = c[ 4 * stride]; + const int t3 = c[ 6 * stride]; + const int t4 = c[ 8 * stride]; + const int t5 = c[10 * stride]; + const int t6 = c[12 * stride]; + const int t7 = c[14 * stride]; + const int t8 = c[16 * stride]; + const int t9 = c[18 * stride]; + const int t10 = c[20 * stride]; + const int t11 = c[22 * stride]; + const int t12 = c[24 * stride]; + const int t13 = c[26 * stride]; + const int t14 = c[28 * stride]; + const int t15 = c[30 * stride]; + const int t16 = c[32 * stride]; + const int t17 = c[34 * stride]; + const int t18 = c[36 * stride]; + const int t19 = c[38 * stride]; + const int t20 = c[40 * stride]; + const int t21 = c[42 * stride]; + const int t22 = c[44 * stride]; + const int t23 = c[46 * stride]; + const int t24 = c[48 * stride]; + const int t25 = c[50 * stride]; + const int t26 = c[52 * stride]; + const int t27 = c[54 * stride]; + const int t28 = c[56 * stride]; + const int t29 = c[58 * stride]; + const int t30 = c[60 * stride]; + const int t31 = c[62 * stride]; + + c[ 0 * stride] = CLIP(t0 + t63a); + c[ 1 * stride] = CLIP(t1 + t62); + c[ 2 * stride] = CLIP(t2 + t61a); + c[ 3 * stride] = CLIP(t3 + t60); + c[ 4 * stride] = CLIP(t4 + t59a); + c[ 5 * stride] = CLIP(t5 + t58); + c[ 6 * stride] = CLIP(t6 + t57a); + c[ 7 * stride] = CLIP(t7 + t56); + c[ 8 * stride] = CLIP(t8 + t55a); + c[ 9 * stride] = CLIP(t9 + t54); + c[10 * stride] = CLIP(t10 + t53a); + c[11 * stride] = CLIP(t11 + t52); + c[12 * stride] = CLIP(t12 + t51a); + c[13 * stride] = CLIP(t13 + t50); + c[14 * stride] = CLIP(t14 + t49a); + c[15 * stride] = CLIP(t15 + t48); + c[16 * stride] = CLIP(t16 + t47); + c[17 * stride] = CLIP(t17 + t46a); + c[18 * stride] = CLIP(t18 + t45); + c[19 * stride] = CLIP(t19 + t44a); + c[20 * stride] = CLIP(t20 + t43); + c[21 * stride] = 
CLIP(t21 + t42a); + c[22 * stride] = CLIP(t22 + t41); + c[23 * stride] = CLIP(t23 + t40a); + c[24 * stride] = CLIP(t24 + t39); + c[25 * stride] = CLIP(t25 + t38a); + c[26 * stride] = CLIP(t26 + t37); + c[27 * stride] = CLIP(t27 + t36a); + c[28 * stride] = CLIP(t28 + t35); + c[29 * stride] = CLIP(t29 + t34a); + c[30 * stride] = CLIP(t30 + t33); + c[31 * stride] = CLIP(t31 + t32a); + c[32 * stride] = CLIP(t31 - t32a); + c[33 * stride] = CLIP(t30 - t33); + c[34 * stride] = CLIP(t29 - t34a); + c[35 * stride] = CLIP(t28 - t35); + c[36 * stride] = CLIP(t27 - t36a); + c[37 * stride] = CLIP(t26 - t37); + c[38 * stride] = CLIP(t25 - t38a); + c[39 * stride] = CLIP(t24 - t39); + c[40 * stride] = CLIP(t23 - t40a); + c[41 * stride] = CLIP(t22 - t41); + c[42 * stride] = CLIP(t21 - t42a); + c[43 * stride] = CLIP(t20 - t43); + c[44 * stride] = CLIP(t19 - t44a); + c[45 * stride] = CLIP(t18 - t45); + c[46 * stride] = CLIP(t17 - t46a); + c[47 * stride] = CLIP(t16 - t47); + c[48 * stride] = CLIP(t15 - t48); + c[49 * stride] = CLIP(t14 - t49a); + c[50 * stride] = CLIP(t13 - t50); + c[51 * stride] = CLIP(t12 - t51a); + c[52 * stride] = CLIP(t11 - t52); + c[53 * stride] = CLIP(t10 - t53a); + c[54 * stride] = CLIP(t9 - t54); + c[55 * stride] = CLIP(t8 - t55a); + c[56 * stride] = CLIP(t7 - t56); + c[57 * stride] = CLIP(t6 - t57a); + c[58 * stride] = CLIP(t5 - t58); + c[59 * stride] = CLIP(t4 - t59a); + c[60 * stride] = CLIP(t3 - t60); + c[61 * stride] = CLIP(t2 - t61a); + c[62 * stride] = CLIP(t1 - t62); + c[63 * stride] = CLIP(t0 - t63a); +} + +static NOINLINE void +inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s, + const int min, const int max, + int32_t *const out, const ptrdiff_t out_s) +{ + assert(in_s > 0 && out_s != 0); + const int in0 = in[0 * in_s], in1 = in[1 * in_s]; + const int in2 = in[2 * in_s], in3 = in[3 * in_s]; + + out[0 * out_s] = (( 1321 * in0 + (3803 - 4096) * in2 + + (2482 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) + + in2 + in3 + in1; + 
out[1 * out_s] = (((2482 - 4096) * in0 - 1321 * in2 - + (3803 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) + + in0 - in3 + in1; + out[2 * out_s] = (209 * (in0 - in2 + in3) + 128) >> 8; + out[3 * out_s] = (((3803 - 4096) * in0 + (2482 - 4096) * in2 - + 1321 * in3 - (3344 - 4096) * in1 + 2048) >> 12) + + in0 + in2 - in1; +} + +static NOINLINE void +inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s, + const int min, const int max, + int32_t *const out, const ptrdiff_t out_s) +{ + assert(in_s > 0 && out_s != 0); + const int in0 = in[0 * in_s], in1 = in[1 * in_s]; + const int in2 = in[2 * in_s], in3 = in[3 * in_s]; + const int in4 = in[4 * in_s], in5 = in[5 * in_s]; + const int in6 = in[6 * in_s], in7 = in[7 * in_s]; + + const int t0a = (((4076 - 4096) * in7 + 401 * in0 + 2048) >> 12) + in7; + const int t1a = (( 401 * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0; + const int t2a = (((3612 - 4096) * in5 + 1931 * in2 + 2048) >> 12) + in5; + const int t3a = (( 1931 * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2; + int t4a = ( 1299 * in3 + 1583 * in4 + 1024) >> 11; + int t5a = ( 1583 * in3 - 1299 * in4 + 1024) >> 11; + int t6a = (( 1189 * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6; + int t7a = (((3920 - 4096) * in1 - 1189 * in6 + 2048) >> 12) + in1; + + const int t0 = CLIP(t0a + t4a); + const int t1 = CLIP(t1a + t5a); + int t2 = CLIP(t2a + t6a); + int t3 = CLIP(t3a + t7a); + const int t4 = CLIP(t0a - t4a); + const int t5 = CLIP(t1a - t5a); + int t6 = CLIP(t2a - t6a); + int t7 = CLIP(t3a - t7a); + + t4a = (((3784 - 4096) * t4 + 1567 * t5 + 2048) >> 12) + t4; + t5a = (( 1567 * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5; + t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7; + t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6; + + out[0 * out_s] = CLIP(t0 + t2 ); + out[7 * out_s] = -CLIP(t1 + t3 ); + t2 = CLIP(t0 - t2 ); + t3 = CLIP(t1 - t3 ); + out[1 * out_s] = -CLIP(t4a + t6a); + out[6 * out_s] = CLIP(t5a + t7a); + t6 = 
CLIP(t4a - t6a); + t7 = CLIP(t5a - t7a); + + out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8); + out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8; + out[2 * out_s] = ((t6 + t7) * 181 + 128) >> 8; + out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8); +} + +static NOINLINE void +inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s, + const int min, const int max, + int32_t *const out, const ptrdiff_t out_s) +{ + assert(in_s > 0 && out_s != 0); + const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s]; + const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s]; + const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s]; + const int in6 = in[ 6 * in_s], in7 = in[ 7 * in_s]; + const int in8 = in[ 8 * in_s], in9 = in[ 9 * in_s]; + const int in10 = in[10 * in_s], in11 = in[11 * in_s]; + const int in12 = in[12 * in_s], in13 = in[13 * in_s]; + const int in14 = in[14 * in_s], in15 = in[15 * in_s]; + + int t0 = ((in15 * (4091 - 4096) + in0 * 201 + 2048) >> 12) + in15; + int t1 = ((in15 * 201 - in0 * (4091 - 4096) + 2048) >> 12) - in0; + int t2 = ((in13 * (3973 - 4096) + in2 * 995 + 2048) >> 12) + in13; + int t3 = ((in13 * 995 - in2 * (3973 - 4096) + 2048) >> 12) - in2; + int t4 = ((in11 * (3703 - 4096) + in4 * 1751 + 2048) >> 12) + in11; + int t5 = ((in11 * 1751 - in4 * (3703 - 4096) + 2048) >> 12) - in4; + int t6 = (in9 * 1645 + in6 * 1220 + 1024) >> 11; + int t7 = (in9 * 1220 - in6 * 1645 + 1024) >> 11; + int t8 = ((in7 * 2751 + in8 * (3035 - 4096) + 2048) >> 12) + in8; + int t9 = ((in7 * (3035 - 4096) - in8 * 2751 + 2048) >> 12) + in7; + int t10 = ((in5 * 2106 + in10 * (3513 - 4096) + 2048) >> 12) + in10; + int t11 = ((in5 * (3513 - 4096) - in10 * 2106 + 2048) >> 12) + in5; + int t12 = ((in3 * 1380 + in12 * (3857 - 4096) + 2048) >> 12) + in12; + int t13 = ((in3 * (3857 - 4096) - in12 * 1380 + 2048) >> 12) + in3; + int t14 = ((in1 * 601 + in14 * (4052 - 4096) + 2048) >> 12) + in14; + int t15 = ((in1 * (4052 - 4096) - in14 * 601 + 2048) >> 12) + in1; + + int t0a = CLIP(t0 + t8 
); + int t1a = CLIP(t1 + t9 ); + int t2a = CLIP(t2 + t10); + int t3a = CLIP(t3 + t11); + int t4a = CLIP(t4 + t12); + int t5a = CLIP(t5 + t13); + int t6a = CLIP(t6 + t14); + int t7a = CLIP(t7 + t15); + int t8a = CLIP(t0 - t8 ); + int t9a = CLIP(t1 - t9 ); + int t10a = CLIP(t2 - t10); + int t11a = CLIP(t3 - t11); + int t12a = CLIP(t4 - t12); + int t13a = CLIP(t5 - t13); + int t14a = CLIP(t6 - t14); + int t15a = CLIP(t7 - t15); + + t8 = ((t8a * (4017 - 4096) + t9a * 799 + 2048) >> 12) + t8a; + t9 = ((t8a * 799 - t9a * (4017 - 4096) + 2048) >> 12) - t9a; + t10 = ((t10a * 2276 + t11a * (3406 - 4096) + 2048) >> 12) + t11a; + t11 = ((t10a * (3406 - 4096) - t11a * 2276 + 2048) >> 12) + t10a; + t12 = ((t13a * (4017 - 4096) - t12a * 799 + 2048) >> 12) + t13a; + t13 = ((t13a * 799 + t12a * (4017 - 4096) + 2048) >> 12) + t12a; + t14 = ((t15a * 2276 - t14a * (3406 - 4096) + 2048) >> 12) - t14a; + t15 = ((t15a * (3406 - 4096) + t14a * 2276 + 2048) >> 12) + t15a; + + t0 = CLIP(t0a + t4a); + t1 = CLIP(t1a + t5a); + t2 = CLIP(t2a + t6a); + t3 = CLIP(t3a + t7a); + t4 = CLIP(t0a - t4a); + t5 = CLIP(t1a - t5a); + t6 = CLIP(t2a - t6a); + t7 = CLIP(t3a - t7a); + t8a = CLIP(t8 + t12); + t9a = CLIP(t9 + t13); + t10a = CLIP(t10 + t14); + t11a = CLIP(t11 + t15); + t12a = CLIP(t8 - t12); + t13a = CLIP(t9 - t13); + t14a = CLIP(t10 - t14); + t15a = CLIP(t11 - t15); + + t4a = ((t4 * (3784 - 4096) + t5 * 1567 + 2048) >> 12) + t4; + t5a = ((t4 * 1567 - t5 * (3784 - 4096) + 2048) >> 12) - t5; + t6a = ((t7 * (3784 - 4096) - t6 * 1567 + 2048) >> 12) + t7; + t7a = ((t7 * 1567 + t6 * (3784 - 4096) + 2048) >> 12) + t6; + t12 = ((t12a * (3784 - 4096) + t13a * 1567 + 2048) >> 12) + t12a; + t13 = ((t12a * 1567 - t13a * (3784 - 4096) + 2048) >> 12) - t13a; + t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a; + t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a; + + out[ 0 * out_s] = CLIP(t0 + t2 ); + out[15 * out_s] = -CLIP(t1 + t3 ); + t2a = CLIP(t0 - t2 ); + t3a = CLIP(t1 - 
/*
 * 4-point identity transform, in place.
 *
 * Each of the four coefficients c[0], c[stride], c[2 * stride],
 * c[3 * stride] is replaced by in + ((in * 1697 + 2048) >> 12).
 * stride must be positive. min/max are unused here.
 */
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    assert(stride > 0);

    ptrdiff_t off = 0;
    for (int n = 0; n < 4; n++, off += stride) {
        const int in = c[off];
        c[off] = in + ((in * 1697 + 2048) >> 12);
    }
}

/*
 * 8-point identity transform, in place: doubles each of the eight
 * coefficients c[0], c[stride], ..., c[7 * stride].
 * stride must be positive. min/max are unused here.
 */
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    assert(stride > 0);

    ptrdiff_t off = 0;
    for (int n = 0; n < 8; n++, off += stride) {
        c[off] *= 2;
    }
}

/*
 * 16-point identity transform, in place.
 *
 * Each of the sixteen coefficients c[0], c[stride], ..., c[15 * stride] is
 * replaced by 2 * in + ((in * 1697 + 1024) >> 11).
 * stride must be positive. min/max are unused here.
 */
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    assert(stride > 0);

    ptrdiff_t off = 0;
    for (int n = 0; n < 16; n++, off += stride) {
        const int in = c[off];
        c[off] = 2 * in + ((in * 1697 + 1024) >> 11);
    }
}

/*
 * 32-point identity transform, in place: multiplies each of the 32
 * coefficients c[0], c[stride], ..., c[31 * stride] by four.
 * stride must be positive. min/max are unused here.
 */
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    assert(stride > 0);

    ptrdiff_t off = 0;
    for (int n = 0; n < 32; n++, off += stride) {
        c[off] *= 4;
    }
}

/*
 * 4-point inverse Walsh-Hadamard step, in place, over c[0], c[stride],
 * c[2 * stride], c[3 * stride]. Uses only additions, subtractions and a
 * single arithmetic shift; there is no clipping and no min/max argument.
 * stride must be positive.
 */
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
    assert(stride > 0);

    int32_t *const p0 = c;
    int32_t *const p1 = c + 1 * stride;
    int32_t *const p2 = c + 2 * stride;
    int32_t *const p3 = c + 3 * stride;

    /* Read all four inputs before any store. */
    const int a = *p0;
    const int b = *p1;
    const int d = *p2;
    const int e = *p3;

    const int sum  = a + b;
    const int diff = d - e;
    const int half = (sum - diff) >> 1;
    const int lo   = half - e;
    const int hi   = half - b;

    *p0 = sum - lo;
    *p1 = lo;
    *p2 = hi;
    *p3 = diff + hi;
}
/* ptrdiff_t and int32_t are used by every declaration below. */
#include <stddef.h>
#include <stdint.h>

#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H

/*
 * Declares a 1-D inverse transform with the common signature:
 *   c      - coefficient buffer, transformed in place
 *   stride - distance, in elements, between consecutive coefficients
 *   min    - lower bound handed to the transform
 *   max    - upper bound handed to the transform
 * `name` may also be a pointer declarator, which is how the itx_1d_fn
 * function-pointer type is formed.
 */
#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);

/* Inverse DCT, 4 to 64 points. */
decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);

/* Inverse ADST, 4 to 16 points. */
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);

/* Inverse ADST with the output written in reverse order. */
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);

/* Identity transforms, 4 to 32 points. */
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);

/* 4-point inverse Walsh-Hadamard transform; takes no min/max bounds, so it
 * does not match itx_1d_fn. */
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

#endif /* DAV1D_SRC_ITX_1D_H */
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+#include "src/itx_1d.h"
+
+static NOINLINE void // generic 2-D inverse transform: 1-D pass over rows, 1-D pass over columns, then add to dst
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
+               const int eob, const int w, const int h, const int shift,
+               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
+               const int has_dconly HIGHBD_DECL_SUFFIX)
+{
+    assert(w >= 4 && w <= 64);
+    assert(h >= 4 && h <= 64);
+    assert(eob >= 0);
+
+    const int is_rect2 = w * 2 == h || h * 2 == w; // 2:1 or 1:2 block
+    const int rnd = (1 << shift) >> 1; // rounding term for ">> shift" (0 when shift == 0)
+
+    if (eob < has_dconly) { // DC-only shortcut: with eob >= 0 this needs has_dconly == 1 and eob == 0
+        int dc = coeff[0];
+        coeff[0] = 0; // the coefficient is consumed
+        if (is_rect2)
+            dc = (dc * 181 + 128) >> 8; // extra 181/256 scaling for 2:1 blocks
+        dc = (dc * 181 + 128) >> 8;
+        dc = (dc + rnd) >> shift;
+        dc = (dc * 181 + 128 + 2048) >> 12;
+        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+            for (int x = 0; x < w; x++)
+                dst[x] = iclip_pixel(dst[x] + dc); // same offset added to every pixel
+        return;
+    }
+
+    const int sh = imin(h, 32), sw = imin(w, 32); // at most 32x32 coefficients are read
+#if BITDEPTH == 8
+    const int row_clip_min = INT16_MIN;
+    const int col_clip_min = INT16_MIN;
+#else
+    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7); // -(bitdepth_max + 1) * 128, shifted as unsigned
+    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5); // -(bitdepth_max + 1) * 32
+#endif
+    const int row_clip_max = ~row_clip_min; // == -min - 1
+    const int col_clip_max = ~col_clip_min;
+
+    int32_t tmp[64 * 64], *c = tmp;
+    for (int y = 0; y < sh; y++, c += w) { // first pass: one 1-D transform per row (stride 1)
+        if (is_rect2)
+            for (int x = 0; x < sw; x++)
+                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
+        else
+            for (int x = 0; x < sw; x++)
+                c[x] = coeff[y + x * sh];
+        first_1d_fn(c, 1, row_clip_min, row_clip_max); // NOTE(review): for w == 64 only c[0..31] was written; presumably the 64-point 1-D functions read just those - confirm in itx_1d.c
+    }
+
+    memset(coeff, 0, sizeof(*coeff) * sw * sh); // clear the coefficients that were read
+    for (int i = 0; i < w * sh; i++)
+        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max); // round, shift and clip between the passes
+
+    for (int x = 0; x < w; x++)
+        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max); // second pass: one 1-D transform per column (stride w)
+
+    c = tmp;
+    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4)); // rounded >> 4, added to dst and clipped
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) /* defines inv_txfm_add_<type1>_<type2>_<w>x<h>_c wrapping inv_txfm_add_c; type1 is the w-point first pass, type2 the h-point second pass */ \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+                                               const ptrdiff_t stride, \
+                                               coef *const coeff, \
+                                               const int eob \
+                                               HIGHBD_DECL_SUFFIX) \
+{ \
+    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
+                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
+                   has_dconly HIGHBD_TAIL_SUFFIX); \
+}
+
+#define inv_txfm_fn64(w, h, shift) /* dct/dct only; the one variant with has_dconly == 1 */ \
+inv_txfm_fn(dct, dct, w, h, shift, 1)
+
+#define inv_txfm_fn32(w, h, shift) /* fn64 set plus identity/identity */ \
+inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(identity, identity, w, h, shift, 0)
+
+#define inv_txfm_fn16(w, h, shift) /* fn32 set plus adst/flipadst/dct mixes and identity with dct */ \
+inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn(adst, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, adst, w, h, shift, 0) \
+inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
+inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(identity, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, identity, w, h, shift, 0) \
+
+#define inv_txfm_fn84(w, h, shift) /* fn16 set plus identity with adst/flipadst */ \
+inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
+inv_txfm_fn(identity, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, identity, w, h, shift, 0) \
+
+inv_txfm_fn84( 4,  4, 0)
+inv_txfm_fn84( 4,  8, 0)
+inv_txfm_fn84( 4, 16, 1)
+inv_txfm_fn84( 8,  4, 0)
+inv_txfm_fn84( 8,  8, 1)
+inv_txfm_fn84( 8, 16, 1)
+inv_txfm_fn32( 8, 32, 2)
+inv_txfm_fn84(16,  4, 1)
+inv_txfm_fn84(16,  8, 1)
+inv_txfm_fn16(16, 16, 2)
+inv_txfm_fn32(16, 32, 1)
+inv_txfm_fn64(16, 64, 2)
+inv_txfm_fn32(32,  8, 2)
+inv_txfm_fn32(32, 16, 1)
+inv_txfm_fn32(32, 32, 2)
+inv_txfm_fn64(32, 64, 1)
+inv_txfm_fn64(64, 16, 2)
+inv_txfm_fn64(64, 32, 1)
+inv_txfm_fn64(64, 64, 2)
+
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, // 4x4 WHT: same two-pass shape, no rounding/clipping between passes
+                                       coef *const coeff, const int eob // eob is not used in this function
+                                       HIGHBD_DECL_SUFFIX)
+{
+    int32_t tmp[4 * 4], *c = tmp;
+    for (int y = 0; y < 4; y++, c += 4) {
+        for (int x = 0; x < 4; x++)
+            c[x] = coeff[y + x * 4] >> 2; // input is pre-scaled by >> 2
+        dav1d_inv_wht4_1d_c(c, 1); // row pass, stride 1
+    }
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4); // clear all 16 coefficients
+
+    for (int x = 0; x < 4; x++)
+        dav1d_inv_wht4_1d_c(&tmp[x], 4); // column pass, stride 4
+
+    c = tmp;
+    for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
+        for (int x = 0; x < 4; x++)
+            dst[x] = iclip_pixel(dst[x] + *c++); // added without a final shift
+}
+
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { // fills c->itxfm_add[size][type] with the C functions defined above
+#define assign_itx_all_fn64(w, h, pfx) /* pfx is empty for square sizes and R for rectangular ones (TX_* vs RTX_*) */ \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
+        inv_txfm_add_dct_dct_##w##x##h##_c
+
+#define assign_itx_all_fn32(w, h, pfx) \
+    assign_itx_all_fn64(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+        inv_txfm_add_identity_identity_##w##x##h##_c
+
+#define assign_itx_all_fn16(w, h, pfx) /* function names are <first pass>_<second pass>, so the order is swapped relative to the enum name, e.g. DCT_ADST -> adst_dct */ \
+    assign_itx_all_fn32(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+        inv_txfm_add_adst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+        inv_txfm_add_dct_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+        inv_txfm_add_adst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+        inv_txfm_add_dct_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+        inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+    assign_itx_all_fn16(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+        inv_txfm_add_adst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+        inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+    assign_itx_all_fn84( 4,  4, ); // the fnNN choice per size mirrors the inv_txfm_fnNN instantiations above
+    assign_itx_all_fn84( 4,  8, R);
+    assign_itx_all_fn84( 4, 16, R);
+    assign_itx_all_fn84( 8,  4, R);
+    assign_itx_all_fn84( 8,  8, );
+    assign_itx_all_fn84( 8, 16, R);
+    assign_itx_all_fn32( 8, 32, R);
+    assign_itx_all_fn84(16,  4, R);
+    assign_itx_all_fn84(16,  8, R);
+    assign_itx_all_fn16(16, 16, );
+    assign_itx_all_fn32(16, 32, R);
+    assign_itx_all_fn64(16, 64, R);
+    assign_itx_all_fn32(32,  8, R);
+    assign_itx_all_fn32(32, 16, R);
+    assign_itx_all_fn32(32, 32, );
+    assign_itx_all_fn64(32, 64, R);
+    assign_itx_all_fn64(64, 16, R);
+    assign_itx_all_fn64(64, 32, R);
+    assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_itx_dsp_init_arm)(c, bpc); // runs after the C table is filled; only this call receives bpc
+#endif
+#if ARCH_X86
+    bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/levels.h b/third_party/dav1d/src/levels.h
new file mode 100644
index 0000000000..571c5801e3
--- /dev/null
+++ b/third_party/dav1d/src/levels.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef DAV1D_SRC_LEVELS_H
+#define DAV1D_SRC_LEVELS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+enum ObuMetaType {
+    OBU_META_HDR_CLL     = 1,
+    OBU_META_HDR_MDCV    = 2,
+    OBU_META_SCALABILITY = 3,
+    OBU_META_ITUT_T35    = 4,
+    OBU_META_TIMECODE    = 5,
+};
+
+enum TxfmSize { // square transform sizes
+    TX_4X4,
+    TX_8X8,
+    TX_16X16,
+    TX_32X32,
+    TX_64X64,
+    N_TX_SIZES, // also the first value of enum RectTxfmSize
+};
+
+enum BlockLevel {
+    BL_128X128,
+    BL_64X64,
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,
+    N_BL_LEVELS,
+};
+
+enum RectTxfmSize {
+    RTX_4X8 = N_TX_SIZES, // rectangular sizes continue the TxfmSize numbering
+    RTX_8X4,
+    RTX_8X16,
+    RTX_16X8,
+    RTX_16X32,
+    RTX_32X16,
+    RTX_32X64,
+    RTX_64X32,
+    RTX_4X16,
+    RTX_16X4,
+    RTX_8X32,
+    RTX_32X8,
+    RTX_16X64,
+    RTX_64X16,
+    N_RECT_TX_SIZES
+};
+
+enum TxfmType {
+    DCT_DCT,    // DCT in both horizontal and vertical
+    ADST_DCT,   // ADST in vertical, DCT in horizontal
+    DCT_ADST,   // DCT in vertical, ADST in horizontal
+    ADST_ADST,  // ADST in both directions
+    FLIPADST_DCT,
+    DCT_FLIPADST,
+    FLIPADST_FLIPADST,
+    ADST_FLIPADST,
+    FLIPADST_ADST,
+    IDTX,
+    V_DCT,
+    H_DCT,
+    V_ADST,
+    H_ADST,
+    V_FLIPADST,
+    H_FLIPADST,
+    N_TX_TYPES, // == 16, the number of entries above
+    WHT_WHT = N_TX_TYPES, // extra entry placed after the 16 regular types
+    N_TX_TYPES_PLUS_LL,
+};
+
+enum TxClass {
+    TX_CLASS_2D,
+    TX_CLASS_H,
+    TX_CLASS_V,
+};
+
+enum IntraPredMode {
+    DC_PRED,
+    VERT_PRED,
+    HOR_PRED,
+    DIAG_DOWN_LEFT_PRED,
+    DIAG_DOWN_RIGHT_PRED,
+    VERT_RIGHT_PRED,
+    HOR_DOWN_PRED,
+    HOR_UP_PRED,
+    VERT_LEFT_PRED,
+    SMOOTH_PRED,
+    SMOOTH_V_PRED,
+    SMOOTH_H_PRED,
+    PAETH_PRED,
+    N_INTRA_PRED_MODES, // == 13
+    CFL_PRED = N_INTRA_PRED_MODES,
+    N_UV_INTRA_PRED_MODES,
+    N_IMPL_INTRA_PRED_MODES = N_UV_INTRA_PRED_MODES,
+    LEFT_DC_PRED = DIAG_DOWN_LEFT_PRED, // from here on the names reuse values 3..8 of the list above
+    TOP_DC_PRED,
+    DC_128_PRED,
+    Z1_PRED,
+    Z2_PRED,
+    Z3_PRED,
+    FILTER_PRED = N_INTRA_PRED_MODES, // same numeric value as CFL_PRED
+};
+
+enum InterIntraPredMode {
+    II_DC_PRED,
+    II_VERT_PRED,
+    II_HOR_PRED,
+    II_SMOOTH_PRED,
+    N_INTER_INTRA_PRED_MODES,
+};
+
+enum BlockPartition {
+    PARTITION_NONE,           // [ ] <-.
+    PARTITION_H,              // [-]   |
+    PARTITION_V,              // [|]   |
+    PARTITION_SPLIT,          // [+] --'
+    PARTITION_T_TOP_SPLIT,    // [⊥] i.e. split top, H bottom
+    PARTITION_T_BOTTOM_SPLIT, // [т] i.e. H top, split bottom
+    PARTITION_T_LEFT_SPLIT,   // [-|] i.e. split left, V right
+    PARTITION_T_RIGHT_SPLIT,  // [|-] i.e. V left, split right
+    PARTITION_H4,             // [Ⲷ]
+    PARTITION_V4,             // [Ⲽ]
+    N_PARTITIONS,
+    N_SUB8X8_PARTITIONS = PARTITION_T_TOP_SPLIT, // == 4: only the first four partitions (marked by the arrow above)
+};
+
+enum BlockSize {
+    BS_128x128,
+    BS_128x64,
+    BS_64x128,
+    BS_64x64,
+    BS_64x32,
+    BS_64x16,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_32x8,
+    BS_16x64,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_16x4,
+    BS_8x32,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x16,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES,
+};
+
+enum Filter2d { // order is horizontal, vertical
+    FILTER_2D_8TAP_REGULAR,
+    FILTER_2D_8TAP_REGULAR_SMOOTH,
+    FILTER_2D_8TAP_REGULAR_SHARP,
+    FILTER_2D_8TAP_SHARP_REGULAR,
+    FILTER_2D_8TAP_SHARP_SMOOTH,
+    FILTER_2D_8TAP_SHARP,
+    FILTER_2D_8TAP_SMOOTH_REGULAR,
+    FILTER_2D_8TAP_SMOOTH,
+    FILTER_2D_8TAP_SMOOTH_SHARP,
+    FILTER_2D_BILINEAR,
+    N_2D_FILTERS,
+};
+
+enum MVJoint {
+    MV_JOINT_ZERO,
+    MV_JOINT_H,
+    MV_JOINT_V,
+    MV_JOINT_HV,
+    N_MV_JOINTS,
+};
+
+enum InterPredMode {
+    NEARESTMV,
+    NEARMV,
+    GLOBALMV,
+    NEWMV,
+    N_INTER_PRED_MODES,
+};
+
+enum DRL_PROXIMITY {
+    NEAREST_DRL,
+    NEARER_DRL,
+    NEAR_DRL,
+    NEARISH_DRL
+};
+
+enum CompInterPredMode {
+    NEARESTMV_NEARESTMV,
+    NEARMV_NEARMV,
+    NEARESTMV_NEWMV,
+    NEWMV_NEARESTMV,
+    NEARMV_NEWMV,
+    NEWMV_NEARMV,
+    GLOBALMV_GLOBALMV,
+    NEWMV_NEWMV,
+    N_COMP_INTER_PRED_MODES,
+};
+
+enum CompInterType {
+    COMP_INTER_NONE,
+    COMP_INTER_WEIGHTED_AVG,
+    COMP_INTER_AVG,
+    COMP_INTER_SEG,
+    COMP_INTER_WEDGE,
+};
+
+enum InterIntraType {
+    INTER_INTRA_NONE,
+    INTER_INTRA_BLEND,
+    INTER_INTRA_WEDGE,
+};
+
+typedef union mv {
+    struct {
+        int16_t y, x; // note the order: y comes first
+    };
+    uint32_t n; // both components viewed as a single 32-bit value
+} mv;
+
+enum MotionMode {
+    MM_TRANSLATION,
+    MM_OBMC,
+    MM_WARP,
+};
+
+#define QINDEX_RANGE 256
+
+typedef struct Av1Block {
+    uint8_t bl, bs, bp;
+    uint8_t intra, seg_id, skip_mode, skip, uvtx;
+    union { // the intra and inter members below share storage
+        struct {
+            uint8_t y_mode, uv_mode, tx, pal_sz[2];
+            int8_t y_angle, uv_angle, cfl_alpha[2];
+        }; // intra
+        struct {
+            union { // two alternative layouts starting with a motion vector
+                struct {
+                    union mv mv[2];
+                    uint8_t wedge_idx, mask_sign, interintra_mode;
+                };
+                struct {
+                    union mv mv2d;
+                    int16_t matrix[4];
+                };
+            };
+            uint8_t comp_type, inter_mode, motion_mode, drl_idx;
+            int8_t ref[2];
+            uint8_t max_ytx, filter2d, interintra_type, tx_split0;
+            uint16_t tx_split1;
+        }; // inter
+    };
+} Av1Block;
+
+#endif /* DAV1D_SRC_LEVELS_H */
diff --git a/third_party/dav1d/src/lf_apply.h b/third_party/dav1d/src/lf_apply.h
new file mode 100644
index 0000000000..6b63b62a49
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_APPLY_H
+#define DAV1D_SRC_LF_APPLY_H
+
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+#include "src/levels.h"
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f, // loop-filters superblock row sby; defined in lf_apply_tmpl.c
+                                    pixel *const p[3], Av1Filter *lflvl, // p[0] is the luma plane, p[1]/p[2] the two chroma planes
+                                    int sby, int start_of_tile_row); // non-zero start_of_tile_row enables the tile-row boundary fix-up
+
+#endif /* DAV1D_SRC_LF_APPLY_H */
diff --git a/third_party/dav1d/src/lf_apply_tmpl.c b/third_party/dav1d/src/lf_apply_tmpl.c
new file mode 100644
index 0000000000..4e860f48b1
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply_tmpl.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f, // luma, vertical edges: one loop_filter_sb[0][0] call per 4-pixel column
+                                       const int have_left,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue; // column 0 is skipped when have_left is 0
+        uint32_t hmask[4];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0]; // [..][0] holds the low 16 bits
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) { // range extends past 16 rows: append the [..][1] half as bits 16..31
+                hmask[0] |= (unsigned) mask[x][0][1] << 16;
+                hmask[1] |= (unsigned) mask[x][1][1] << 16;
+                hmask[2] |= (unsigned) mask[x][2][1] << 16;
+            }
+        } else { // non-zero starty4: only the [..][1] half is used, unshifted
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride, // level component 0
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f, // luma, horizontal edges: one loop_filter_sb[0][1] call per 4-pixel row
+                                       const int have_top,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue; // row 0 is skipped when have_top is 0
+        const uint32_t vmask[4] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16), // both 16-bit halves are always combined here
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride, // level component 1
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f, // chroma, vertical edges: the same mask is applied to both u and v
+                                        const int have_left,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_ver)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue; // column 0 is skipped when have_left is 0
+        uint32_t hmask[3];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) { // the second half starts at bit 16 (or 8 with vertical subsampling)
+                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride, // level component 2 for u
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride, // level component 3 for v
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f, // chroma, horizontal edges: the same mask is applied to both u and v
+                                        const int have_top,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_hor)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    ptrdiff_t off_l = 0; // running offset shared by the u and v pointers
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue; // row 0 is skipped when have_top is 0
+        const uint32_t vmask[3] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)), // second half at bit 16 (or 8 with horizontal subsampling)
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+    }
+}
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f, // adjusts the masks at tile boundaries, then filters luma cols/rows and chroma cols/rows
+                                    pixel *const p[3], Av1Filter *const lflvl,
+                                    int sby, const int start_of_tile_row)
+{
+    int x, have_left;
+    // Don't filter outside the frame
+    const int have_top = sby > 0;
+    const int is_sb64 = !f->seq_hdr->sb128; // 1 when seq_hdr->sb128 is 0
+    const int starty4 = (sby & is_sb64) << 4; // 16 for odd sby when is_sb64, otherwise 0
+    const int sbsz = 32 >> is_sb64; // 32 or 16
+    const int sbl2 = 5 - is_sb64; // log2(sbsz)
+    const int halign = (f->bh + 31) & ~31; // f->bh rounded up to a multiple of 32
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; // bit widths (16 or 8) of one chroma mask half
+    const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
+    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); // clamp to the rows left in the frame
+    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; // rounds up when vertically subsampled
+
+    // fix lpf strength at tile col boundaries
+    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+    for (int tile_col = 1;; tile_col++) { // starts at tile column 1; the only exit is the break below
+        x = f->frame_hdr->tiling.col_start_sb[tile_col];
+        if ((x << sbl2) >= f->bw) break; // tile start at or beyond f->bw: done
+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
+        x >>= is_sb64; // convert to an index into lflvl[]
+
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U; // which 16-bit half holds this row's bit
+            const unsigned smask = mask >> (sidx << 4);
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+                                !!(y_hmask[1][sidx] & smask); // index currently set (0 if neither 1 nor 2 is set)
+            y_hmask[2][sidx] &= ~smask;
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; // re-set the bit at the smaller of the two indices
+        }
+
+        if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { // chroma masks are untouched for I400
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+                 y++, uv_mask <<= 1)
+            {
+                const int sidx = uv_mask >= vmax;
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+                const int idx = !!(uv_hmask[1][sidx] & smask);
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+            }
+        }
+        lpf_y += halign; // advance to the next tile column's entries
+        lpf_uv += halign >> ss_ver;
+    }
+
+    // fix lpf strength at tile row boundaries
+    if (start_of_tile_row) {
+        const BlockContext *a;
+        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)]; // start_of_tile_row - 1 selects the block of f->a[] entries
+             x < f->sb128w; x++, a++)
+        {
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+            const unsigned w = imin(32, f->w4 - (x << 5));
+            for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
+                const int sidx = mask >= 0x10000U;
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; // same min-index fix-up as for tile columns
+            }
+
+            if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                const unsigned cw = (w + ss_hor) >> ss_hor;
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
+                    const int sidx = uv_mask >= hmax;
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+                }
+            }
+        }
+    }
+
+    pixel *ptr;
+    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; // luma vertical edges, one 128-pixel-wide unit per iteration
+         x++, have_left = 1, ptr += 128, level_ptr += 32)
+    {
+        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[0], ptr, f->cur.stride[0],
+                            imin(32, f->w4 - x * 32), starty4, endy4);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { // luma horizontal edges, after all vertical edges of the row
+        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[1], ptr, f->cur.stride[0],
+                            imin(32, f->w4 - x * 32), starty4, endy4);
+    }
+
+    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+        return; // chroma is skipped when both chroma levels are 0
+
+    ptrdiff_t uv_off;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[0],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, x = 0; x < f->sb128w;
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[1],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
+    }
+}
diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c
new file mode 100644
index 0000000000..4c99864e92
--- /dev/null
+++ 
b/third_party/dav1d/src/lf_mask.c @@ -0,0 +1,482 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include + +#include "common/intops.h" + +#include "src/ctx.h" +#include "src/levels.h" +#include "src/lf_mask.h" +#include "src/tables.h" + +static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* x */], + const enum RectTxfmSize from, + const int depth, + const int y_off, const int x_off, + const uint16_t *const tx_masks) +{ + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from]; + const int is_split = (from == (int) TX_4X4 || depth > 1) ? 
0 : + (tx_masks[depth] >> (y_off * 4 + x_off)) & 1; + + if (is_split) { + const enum RectTxfmSize sub = t_dim->sub; + const int htw4 = t_dim->w >> 1, hth4 = t_dim->h >> 1; + + decomp_tx(txa, sub, depth + 1, y_off * 2 + 0, x_off * 2 + 0, tx_masks); + if (t_dim->w >= t_dim->h) + decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][0][htw4], + sub, depth + 1, y_off * 2 + 0, x_off * 2 + 1, tx_masks); + if (t_dim->h >= t_dim->w) { + decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][0], + sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks); + if (t_dim->w >= t_dim->h) + decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4], + sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks); + } + } else { + const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh); + +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + for (int y = 0; y < t_dim->h; y++) { \ + rep_macro(type, txa[0][0][y], off, mul * lw); \ + rep_macro(type, txa[1][0][y], off, mul * lh); \ + txa[0][1][y][0] = t_dim->w; \ + } + case_set_upto16(t_dim->w,,, 0); +#undef set_ctx +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, txa[1][1][0], off, mul * t_dim->h) + case_set_upto16(t_dim->w,,, 0); +#undef set_ctx + } +} + +static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2], + const int by4, const int bx4, + const int w4, const int h4, const int skip, + const enum RectTxfmSize max_tx, + const uint16_t *const tx_masks, + uint8_t *const a, uint8_t *const l) +{ + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx]; + int y, x; + + uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */]; + for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++) + for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++) + decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x], + max_tx, 0, y_off, x_off, tx_masks); + + // left block edge + unsigned mask = 1U << by4; + for (y = 0; y < h4; y++, mask <<= 1) { + const int sidx = mask >= 0x10000; + const unsigned smask = 
mask >> (sidx << 4); + masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask; + } + + // top block edge + for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) { + const int sidx = mask >= 0x10000; + const unsigned smask = mask >> (sidx << 4); + masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask; + } + + if (!skip) { + // inner (tx) left|right edges + for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) { + const int sidx = mask >= 0x10000U; + const unsigned smask = mask >> (sidx << 4); + int ltx = txa[0][0][y][0]; + int step = txa[0][1][y][0]; + for (x = step; x < w4; x += step) { + const int rtx = txa[0][0][y][x]; + masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask; + ltx = rtx; + step = txa[0][1][y][x]; + } + } + + // top + // inner (tx) --- edges + // bottom + for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) { + const int sidx = mask >= 0x10000U; + const unsigned smask = mask >> (sidx << 4); + int ttx = txa[1][0][0][x]; + int step = txa[1][1][0][x]; + for (y = step; y < h4; y += step) { + const int btx = txa[1][0][y][x]; + masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask; + ttx = btx; + step = txa[1][1][y][x]; + } + } + } + + for (y = 0; y < h4; y++) + l[y] = txa[0][0][y][w4 - 1]; + memcpy(a, txa[1][0][h4 - 1], w4); +} + +static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2], + const int by4, const int bx4, + const int w4, const int h4, + const enum RectTxfmSize tx, + uint8_t *const a, uint8_t *const l) +{ + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; + const int twl4 = t_dim->lw, thl4 = t_dim->lh; + const int twl4c = imin(2, twl4), thl4c = imin(2, thl4); + int y, x; + + // left block edge + unsigned mask = 1U << by4; + for (y = 0; y < h4; y++, mask <<= 1) { + const int sidx = mask >= 0x10000; + const unsigned smask = mask >> (sidx << 4); + masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask; + } + + // top block edge + for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) { + const int sidx = mask >= 0x10000; + 
const unsigned smask = mask >> (sidx << 4); + masks[1][by4][imin(thl4c, a[x])][sidx] |= smask; + } + + // inner (tx) left|right edges + const int hstep = t_dim->w; + unsigned t = 1U << by4; + unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t); + unsigned inner1 = inner & 0xffff, inner2 = inner >> 16; + for (x = hstep; x < w4; x += hstep) { + if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1; + if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2; + } + + // top + // inner (tx) --- edges + // bottom + const int vstep = t_dim->h; + t = 1U << bx4; + inner = (unsigned) ((((uint64_t) t) << w4) - t); + inner1 = inner & 0xffff; + inner2 = inner >> 16; + for (y = vstep; y < h4; y += vstep) { + if (inner1) masks[1][by4 + y][thl4c][0] |= inner1; + if (inner2) masks[1][by4 + y][thl4c][1] |= inner2; + } + +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, a, off, mul * thl4c) +#define default_memset(dir, diridx, off, var) \ + memset(a, thl4c, var) + case_set_upto32_with_default(w4,,, 0); +#undef default_memset +#undef set_ctx +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, l, off, mul * twl4c) +#define default_memset(dir, diridx, off, var) \ + memset(l, twl4c, var) + case_set_upto32_with_default(h4,,, 0); +#undef default_memset +#undef set_ctx +} + +static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2], + const int cby4, const int cbx4, + const int cw4, const int ch4, + const int skip_inter, + const enum RectTxfmSize tx, + uint8_t *const a, uint8_t *const l, + const int ss_hor, const int ss_ver) +{ + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; + const int twl4 = t_dim->lw, thl4 = t_dim->lh; + const int twl4c = !!twl4, thl4c = !!thl4; + int y, x; + const int vbits = 4 - ss_ver, hbits = 4 - ss_hor; + const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; + const unsigned vmax = 1 << vmask, hmax = 1 << hmask; + + // left block edge + unsigned mask = 1U << cby4; + for (y = 0; y < ch4; y++, mask 
<<= 1) { + const int sidx = mask >= vmax; + const unsigned smask = mask >> (sidx << vbits); + masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask; + } + + // top block edge + for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) { + const int sidx = mask >= hmax; + const unsigned smask = mask >> (sidx << hbits); + masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask; + } + + if (!skip_inter) { + // inner (tx) left|right edges + const int hstep = t_dim->w; + unsigned t = 1U << cby4; + unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t); + unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask; + for (x = hstep; x < cw4; x += hstep) { + if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1; + if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2; + } + + // top + // inner (tx) --- edges + // bottom + const int vstep = t_dim->h; + t = 1U << cbx4; + inner = (unsigned) ((((uint64_t) t) << cw4) - t); + inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask; + for (y = vstep; y < ch4; y += vstep) { + if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1; + if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2; + } + } + +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, a, off, mul * thl4c) +#define default_memset(dir, diridx, off, var) \ + memset(a, thl4c, var) + case_set_upto32_with_default(cw4,,, 0); +#undef default_memset +#undef set_ctx +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, l, off, mul * twl4c) +#define default_memset(dir, diridx, off, var) \ + memset(l, twl4c, var) + case_set_upto32_with_default(ch4,,, 0); +#undef default_memset +#undef set_ctx +} + +void dav1d_create_lf_mask_intra(Av1Filter *const lflvl, + uint8_t (*const level_cache)[4], + const ptrdiff_t b4_stride, + const uint8_t (*filter_level)[8][2], + const int bx, const int by, + const int iw, const int ih, + const enum BlockSize bs, + const enum RectTxfmSize ytx, + const enum RectTxfmSize uvtx, + const enum Dav1dPixelLayout layout, 
+ uint8_t *const ay, uint8_t *const ly, + uint8_t *const auv, uint8_t *const luv) +{ + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = imin(iw - bx, b_dim[0]); + const int bh4 = imin(ih - by, b_dim[1]); + const int bx4 = bx & 31; + const int by4 = by & 31; + + if (bw4 && bh4) { + uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx; + for (int y = 0; y < bh4; y++) { + for (int x = 0; x < bw4; x++) { + level_cache_ptr[x][0] = filter_level[0][0][0]; + level_cache_ptr[x][1] = filter_level[1][0][0]; + } + level_cache_ptr += b4_stride; + } + + mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly); + } + + if (!auv) return; + + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor), + (b_dim[0] + ss_hor) >> ss_hor); + const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver), + (b_dim[1] + ss_ver) >> ss_ver); + + if (!cbw4 || !cbh4) return; + + const int cbx4 = bx4 >> ss_hor; + const int cby4 = by4 >> ss_ver; + + uint8_t (*level_cache_ptr)[4] = + level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor); + for (int y = 0; y < cbh4; y++) { + for (int x = 0; x < cbw4; x++) { + level_cache_ptr[x][2] = filter_level[2][0][0]; + level_cache_ptr[x][3] = filter_level[3][0][0]; + } + level_cache_ptr += b4_stride; + } + + mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx, + auv, luv, ss_hor, ss_ver); +} + +void dav1d_create_lf_mask_inter(Av1Filter *const lflvl, + uint8_t (*const level_cache)[4], + const ptrdiff_t b4_stride, + const uint8_t (*filter_level)[8][2], + const int bx, const int by, + const int iw, const int ih, + const int skip, const enum BlockSize bs, + const enum RectTxfmSize max_ytx, + const uint16_t *const tx_masks, + const enum RectTxfmSize uvtx, + const enum Dav1dPixelLayout layout, + uint8_t *const ay, uint8_t *const ly, + uint8_t *const auv, uint8_t *const luv) +{ + 
const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = imin(iw - bx, b_dim[0]); + const int bh4 = imin(ih - by, b_dim[1]); + const int bx4 = bx & 31; + const int by4 = by & 31; + + if (bw4 && bh4) { + uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx; + for (int y = 0; y < bh4; y++) { + for (int x = 0; x < bw4; x++) { + level_cache_ptr[x][0] = filter_level[0][0][0]; + level_cache_ptr[x][1] = filter_level[1][0][0]; + } + level_cache_ptr += b4_stride; + } + + mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip, + max_ytx, tx_masks, ay, ly); + } + + if (!auv) return; + + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor), + (b_dim[0] + ss_hor) >> ss_hor); + const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver), + (b_dim[1] + ss_ver) >> ss_ver); + + if (!cbw4 || !cbh4) return; + + const int cbx4 = bx4 >> ss_hor; + const int cby4 = by4 >> ss_ver; + + uint8_t (*level_cache_ptr)[4] = + level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor); + for (int y = 0; y < cbh4; y++) { + for (int x = 0; x < cbw4; x++) { + level_cache_ptr[x][2] = filter_level[2][0][0]; + level_cache_ptr[x][3] = filter_level[3][0][0]; + } + level_cache_ptr += b4_stride; + } + + mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx, + auv, luv, ss_hor, ss_ver); +} + +void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) { + // set E/I/H values from loopfilter level + const int sharp = filter_sharpness; + for (int level = 0; level < 64; level++) { + int limit = level; + + if (sharp > 0) { + limit >>= (sharp + 3) >> 2; + limit = imin(limit, 9 - sharp); + } + limit = imax(limit, 1); + + lim_lut->i[level] = limit; + lim_lut->e[level] = 2 * (level + 2) + limit; + } + lim_lut->sharp[0] = (sharp + 3) >> 2; + lim_lut->sharp[1] = sharp ? 
9 - sharp : 0xff; +} + +static inline void calc_lf_value(uint8_t (*const lflvl_values)[2], + const int is_chroma, const int base_lvl, + const int lf_delta, const int seg_delta, + const Dav1dLoopfilterModeRefDeltas *const mr_delta) +{ + const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63); + + if (!base_lvl && is_chroma) { + memset(lflvl_values, 0, 8 * 2); + } else if (!mr_delta) { + memset(lflvl_values, base, 8 * 2); + } else { + const int sh = base >= 32; + lflvl_values[0][0] = lflvl_values[0][1] = + iclip(base + (mr_delta->ref_delta[0] * (1 << sh)), 0, 63); + for (int r = 1; r < 8; r++) { + for (int m = 0; m < 2; m++) { + const int delta = + mr_delta->mode_delta[m] + mr_delta->ref_delta[r]; + lflvl_values[r][m] = iclip(base + (delta * (1 << sh)), 0, 63); + } + } + } +} + +void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2], + const Dav1dFrameHeader *const hdr, + const int8_t lf_delta[4]) +{ + const int n_seg = hdr->segmentation.enabled ? 8 : 1; + + if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) { + memset(lflvl_values, 0, 8 * 4 * 2 * n_seg); + return; + } + + const Dav1dLoopfilterModeRefDeltas *const mr_deltas = + hdr->loopfilter.mode_ref_delta_enabled ? + &hdr->loopfilter.mode_ref_deltas : NULL; + for (int s = 0; s < n_seg; s++) { + const Dav1dSegmentationData *const segd = + hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL; + + calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0], + lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas); + calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1], + lf_delta[hdr->delta.lf.multi ? 1 : 0], + segd ? segd->delta_lf_y_h : 0, mr_deltas); + calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u, + lf_delta[hdr->delta.lf.multi ? 2 : 0], + segd ? segd->delta_lf_u : 0, mr_deltas); + calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v, + lf_delta[hdr->delta.lf.multi ? 3 : 0], + segd ? 
segd->delta_lf_v : 0, mr_deltas); + } +} diff --git a/third_party/dav1d/src/lf_mask.h b/third_party/dav1d/src/lf_mask.h new file mode 100644 index 0000000000..0c9caa6880 --- /dev/null +++ b/third_party/dav1d/src/lf_mask.h @@ -0,0 +1,83 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LF_MASK_H +#define DAV1D_SRC_LF_MASK_H + +#include <stddef.h> +#include <stdint.h> + +#include "src/levels.h" + +typedef struct Av1FilterLUT { + uint8_t e[64]; + uint8_t i[64]; + uint64_t sharp[2]; +} Av1FilterLUT; + +typedef struct Av1RestorationUnit { + enum Dav1dRestorationType type; + int16_t filter_h[3]; + int16_t filter_v[3]; + uint8_t sgr_idx; + int16_t sgr_weights[2]; +} Av1RestorationUnit; + +// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling +typedef struct Av1Filter { + // each bit is 1 col + uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2]; + uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2]; + int8_t cdef_idx[4]; // -1 means "unset" + uint16_t noskip_mask[32][2]; +} Av1Filter; + +// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling +typedef struct Av1Restoration { + Av1RestorationUnit lr[3][4]; +} Av1Restoration; + +void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4], + const ptrdiff_t b4_stride, + const uint8_t (*level)[8][2], int bx, int by, + int iw, int ih, enum BlockSize bs, + enum RectTxfmSize ytx, enum RectTxfmSize uvtx, + enum Dav1dPixelLayout layout, uint8_t *ay, + uint8_t *ly, uint8_t *auv, uint8_t *luv); +void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4], + const ptrdiff_t b4_stride, + const uint8_t (*level)[8][2], int bx, int by, + int iw, int ih, int skip_inter, + enum BlockSize bs, enum RectTxfmSize max_ytx, + const uint16_t *tx_mask, enum RectTxfmSize uvtx, + enum Dav1dPixelLayout layout, uint8_t *ay, + uint8_t *ly, uint8_t *auv, uint8_t *luv); +void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness); +void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr, + const int8_t lf_delta[4]); + +#endif /* DAV1D_SRC_LF_MASK_H */ diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c new file mode 100644 index 0000000000..76f805a78e --- /dev/null +++ b/third_party/dav1d/src/lib.c @@ -0,0
+1,649 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "vcs_version.h" + +#include <errno.h> +#include <string.h> + +#if defined(__linux__) && defined(HAVE_DLSYM) +#include <dlfcn.h> +#endif + +#include "dav1d/dav1d.h" +#include "dav1d/data.h" + +#include "common/validate.h" + +#include "src/cpu.h" +#include "src/fg_apply.h" +#include "src/internal.h" +#include "src/log.h" +#include "src/obu.h" +#include "src/qm.h" +#include "src/ref.h" +#include "src/thread_task.h" +#include "src/wedge.h" + +static COLD void init_internal(void) { + dav1d_init_cpu(); + dav1d_init_interintra_masks(); + dav1d_init_qm_tables(); + dav1d_init_thread(); + dav1d_init_wedge_masks(); +} + +COLD const char *dav1d_version(void) { + return DAV1D_VERSION; +} + +COLD void dav1d_default_settings(Dav1dSettings *const s) { + s->n_frame_threads = 1; + s->n_tile_threads = 1; + s->apply_grain = 1; + s->allocator.cookie = NULL; + s->allocator.alloc_picture_callback = dav1d_default_picture_alloc; + s->allocator.release_picture_callback = dav1d_default_picture_release; + s->logger.cookie = NULL; + s->logger.callback = dav1d_log_default_callback; + s->operating_point = 0; + s->all_layers = 1; // just until the tests are adjusted + s->frame_size_limit = 0; +} + +static COLD int init_mem_pools(Dav1dContext *const c) { + if (!pthread_mutex_init(&c->seq_hdr_pool.lock, NULL)) { + if (!pthread_mutex_init(&c->frame_hdr_pool.lock, NULL)) { + if (!pthread_mutex_init(&c->segmap_pool.lock, NULL)) { + if (!pthread_mutex_init(&c->refmvs_pool.lock, NULL)) { + if (!pthread_mutex_init(&c->cdf_pool.lock, NULL)) { + if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) { + if (!pthread_mutex_init(&c->picture_pool.lock, NULL)) { + c->allocator.cookie = &c->picture_pool; + c->mem_pools_inited = 2; + return 0; + } + } else { + c->mem_pools_inited = 1; + return 0; + } + pthread_mutex_destroy(&c->cdf_pool.lock); + } + pthread_mutex_destroy(&c->refmvs_pool.lock); + } + pthread_mutex_destroy(&c->segmap_pool.lock); + } +
pthread_mutex_destroy(&c->frame_hdr_pool.lock); + } + pthread_mutex_destroy(&c->seq_hdr_pool.lock); + } + return -1; +} + +static void close_internal(Dav1dContext **const c_out, int flush); + +NO_SANITIZE("cfi-icall") // CFI is broken with dlsym() +static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) { +#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__) + /* glibc has an issue where the size of the TLS is subtracted from the stack + * size instead of allocated separately. As a result the specified stack + * size may be insufficient when used in an application with large amounts + * of TLS data. The following is a workaround to compensate for that. + * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */ + size_t (*const get_minstack)(const pthread_attr_t*) = + dlsym(RTLD_DEFAULT, "__pthread_get_minstack"); + if (get_minstack) + return get_minstack(thread_attr) - PTHREAD_STACK_MIN; +#endif + return 0; +} + +COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { + static pthread_once_t initted = PTHREAD_ONCE_INIT; + pthread_once(&initted, init_internal); + + validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_tile_threads >= 1 && + s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_frame_threads >= 1 && + s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->allocator.alloc_picture_callback != NULL, + DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->allocator.release_picture_callback != NULL, + DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->operating_point >= 0 && + s->operating_point <= 31, DAV1D_ERR(EINVAL)); + + pthread_attr_t thread_attr; + if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM); + size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr); + + 
pthread_attr_setstacksize(&thread_attr, stack_size); + + Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32); + if (!c) goto error; + memset(c, 0, sizeof(*c)); + + c->allocator = s->allocator; + c->logger = s->logger; + c->apply_grain = s->apply_grain; + c->operating_point = s->operating_point; + c->all_layers = s->all_layers; + c->frame_size_limit = s->frame_size_limit; + + if (init_mem_pools(c)) goto error; + + /* On 32-bit systems extremely large frame sizes can cause overflows in + * dav1d_decode_frame() malloc size calculations. Prevent that from occuring + * by enforcing a maximum frame size limit, chosen to roughly correspond to + * the largest size possible to decode without exhausting virtual memory. */ + if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) { + c->frame_size_limit = 8192 * 8192; + if (s->frame_size_limit) + dav1d_log(c, "Frame size limit reduced from %u to %u.\n", + s->frame_size_limit, c->frame_size_limit); + } + + c->frame_thread.flush = &c->frame_thread.flush_mem; + atomic_init(c->frame_thread.flush, 0); + c->n_fc = s->n_frame_threads; + c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32); + if (!c->fc) goto error; + memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads); + if (c->n_fc > 1) { + c->frame_thread.out_delayed = + calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); + if (!c->frame_thread.out_delayed) goto error; + } + for (int n = 0; n < s->n_frame_threads; n++) { + Dav1dFrameContext *const f = &c->fc[n]; + f->c = c; + f->lf.last_sharpness = -1; + f->n_tc = s->n_tile_threads; + f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64); + if (!f->tc) goto error; + memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads); + if (f->n_tc > 1) { + if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error; + if (pthread_cond_init(&f->tile_thread.cond, NULL)) { + pthread_mutex_destroy(&f->tile_thread.lock); + goto error; + } + if (pthread_cond_init(&f->tile_thread.icond, 
NULL)) { + pthread_mutex_destroy(&f->tile_thread.lock); + pthread_cond_destroy(&f->tile_thread.cond); + goto error; + } + f->tile_thread.inited = 1; + } + for (int m = 0; m < s->n_tile_threads; m++) { + Dav1dTileContext *const t = &f->tc[m]; + t->f = f; + memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc)); + if (f->n_tc > 1) { + if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error; + if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) { + pthread_mutex_destroy(&t->tile_thread.td.lock); + goto error; + } + t->tile_thread.fttd = &f->tile_thread; + if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) { + pthread_cond_destroy(&t->tile_thread.td.cond); + pthread_mutex_destroy(&t->tile_thread.td.lock); + goto error; + } + t->tile_thread.td.inited = 1; + } + } + dav1d_refmvs_init(&f->rf); + if (c->n_fc > 1) { + if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error; + if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) { + pthread_mutex_destroy(&f->frame_thread.td.lock); + goto error; + } + if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) { + pthread_cond_destroy(&f->frame_thread.td.cond); + pthread_mutex_destroy(&f->frame_thread.td.lock); + goto error; + } + f->frame_thread.td.inited = 1; + } + } + + // intra edge tree + c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node; + dav1d_init_mode_tree(c->intra_edge.root[BL_128X128], c->intra_edge.tip_sb128, 1); + c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node; + dav1d_init_mode_tree(c->intra_edge.root[BL_64X64], c->intra_edge.tip_sb64, 0); + + pthread_attr_destroy(&thread_attr); + + return 0; + +error: + if (c) close_internal(c_out, 0); + pthread_attr_destroy(&thread_attr); + return DAV1D_ERR(ENOMEM); +} + +static void dummy_free(const uint8_t *const data, void *const user_data) { + assert(data && !user_data); +} + +int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out, + const uint8_t *const ptr, 
const size_t sz) +{ + Dav1dData buf = { 0 }; + int res; + + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + + Dav1dSettings s; + dav1d_default_settings(&s); + s.logger.callback = NULL; + + Dav1dContext *c; + res = dav1d_open(&c, &s); + if (res < 0) return res; + + if (ptr) { + res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL); + if (res < 0) goto error; + } + + while (buf.sz > 0) { + res = dav1d_parse_obus(c, &buf, 1); + if (res < 0) goto error; + + assert((size_t)res <= buf.sz); + buf.sz -= res; + buf.data += res; + } + + if (!c->seq_hdr) { + res = DAV1D_ERR(EINVAL); + goto error; + } + + memcpy(out, c->seq_hdr, sizeof(*out)); + + res = 0; +error: + dav1d_data_unref_internal(&buf); + dav1d_close(&c); + + return res; +} + +static int output_image(Dav1dContext *const c, Dav1dPicture *const out, + Dav1dPicture *const in) +{ + const Dav1dFilmGrainData *fgdata = &in->frame_hdr->film_grain.data; + int has_grain = fgdata->num_y_points || fgdata->num_uv_points[0] || + fgdata->num_uv_points[1]; + + // If there is nothing to be done, skip the allocation/copy + if (!c->apply_grain || !has_grain) { + dav1d_picture_move_ref(out, in); + return 0; + } + + // Apply film grain to a new copy of the image to avoid corrupting refs + int res = dav1d_picture_alloc_copy(c, out, in->p.w, in); + if (res < 0) { + dav1d_picture_unref_internal(in); + dav1d_picture_unref_internal(out); + return res; + } + + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in); + break; +#endif + default: + assert(0); + } + + dav1d_picture_unref_internal(in); + return 0; +} + +static int output_picture_ready(Dav1dContext *const c) { + + if (!c->out.data[0]) return 0; + + // skip lower spatial layers + if (c->operating_point_idc && !c->all_layers) { + const int max_spatial_id = ulog2(c->operating_point_idc >> 8); + if 
(max_spatial_id > c->out.frame_hdr->spatial_id) { + dav1d_picture_unref_internal(&c->out); + return 0; + } + } + + return 1; +} + +static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { + unsigned drain_count = 0; + do { + const unsigned next = c->frame_thread.next; + Dav1dFrameContext *const f = &c->fc[next]; + pthread_mutex_lock(&f->frame_thread.td.lock); + while (f->n_tile_data > 0) + pthread_cond_wait(&f->frame_thread.td.cond, + &f->frame_thread.td.lock); + pthread_mutex_unlock(&f->frame_thread.td.lock); + Dav1dThreadPicture *const out_delayed = + &c->frame_thread.out_delayed[next]; + if (++c->frame_thread.next == c->n_fc) + c->frame_thread.next = 0; + if (out_delayed->p.data[0]) { + const unsigned progress = + atomic_load_explicit(&out_delayed->progress[1], + memory_order_relaxed); + if (out_delayed->visible && progress != FRAME_ERROR) + dav1d_picture_ref(&c->out, &out_delayed->p); + dav1d_thread_picture_unref(out_delayed); + if (output_picture_ready(c)) + return output_image(c, out, &c->out); + } + } while (++drain_count < c->n_fc); + + return DAV1D_ERR(EAGAIN); +} + +static int gen_picture(Dav1dContext *const c) +{ + int res; + Dav1dData *const in = &c->in; + + if (output_picture_ready(c)) + return 0; + + while (in->sz > 0) { + res = dav1d_parse_obus(c, in, 0); + if (res < 0) { + dav1d_data_unref_internal(in); + } else { + assert((size_t)res <= in->sz); + in->sz -= res; + in->data += res; + if (!in->sz) dav1d_data_unref_internal(in); + } + if (output_picture_ready(c)) + break; + if (res < 0) + return res; + } + + return 0; +} + +int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in) +{ + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(in->data == NULL || in->sz, DAV1D_ERR(EINVAL)); + + if (in->data) + c->drain = 0; + if (c->in.data) + return DAV1D_ERR(EAGAIN); + dav1d_data_ref(&c->in, in); + + int res = gen_picture(c); + if (!res) + 
dav1d_data_unref_internal(in); + + return res; +} + +int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out) +{ + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + + const int drain = c->drain; + c->drain = 1; + + int res = gen_picture(c); + if (res < 0) + return res; + + if (output_picture_ready(c)) + return output_image(c, out, &c->out); + + if (c->n_fc > 1 && drain) + return drain_picture(c, out); + + return DAV1D_ERR(EAGAIN); +} + +void dav1d_flush(Dav1dContext *const c) { + dav1d_data_unref_internal(&c->in); + c->drain = 0; + + for (int i = 0; i < 8; i++) { + if (c->refs[i].p.p.data[0]) + dav1d_thread_picture_unref(&c->refs[i].p); + dav1d_ref_dec(&c->refs[i].segmap); + dav1d_ref_dec(&c->refs[i].refmvs); + dav1d_cdf_thread_unref(&c->cdf[i]); + } + c->frame_hdr = NULL; + c->seq_hdr = NULL; + dav1d_ref_dec(&c->seq_hdr_ref); + + c->mastering_display = NULL; + c->content_light = NULL; + c->itut_t35 = NULL; + dav1d_ref_dec(&c->mastering_display_ref); + dav1d_ref_dec(&c->content_light_ref); + dav1d_ref_dec(&c->itut_t35_ref); + + if (c->n_fc == 1) return; + + // mark each currently-running frame as flushing, so that we + // exit out as quickly as the running thread checks this flag + atomic_store(c->frame_thread.flush, 1); + for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) { + if (next == c->n_fc) next = 0; + Dav1dFrameContext *const f = &c->fc[next]; + pthread_mutex_lock(&f->frame_thread.td.lock); + if (f->n_tile_data > 0) { + while (f->n_tile_data > 0) + pthread_cond_wait(&f->frame_thread.td.cond, + &f->frame_thread.td.lock); + assert(!f->cur.data[0]); + } + pthread_mutex_unlock(&f->frame_thread.td.lock); + Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; + if (out_delayed->p.data[0]) + dav1d_thread_picture_unref(out_delayed); + } + atomic_store(c->frame_thread.flush, 0); + + c->frame_thread.next = 0; +} + +COLD void 
dav1d_close(Dav1dContext **const c_out) { + validate_input(c_out != NULL); + close_internal(c_out, 1); +} + +static COLD void close_internal(Dav1dContext **const c_out, int flush) { + Dav1dContext *const c = *c_out; + if (!c) return; + + if (flush) dav1d_flush(c); + + for (unsigned n = 0; c->fc && n < c->n_fc; n++) { + Dav1dFrameContext *const f = &c->fc[n]; + + // clean-up threading stuff + if (c->n_fc > 1 && f->frame_thread.td.inited) { + pthread_mutex_lock(&f->frame_thread.td.lock); + f->frame_thread.die = 1; + pthread_cond_signal(&f->frame_thread.td.cond); + pthread_mutex_unlock(&f->frame_thread.td.lock); + pthread_join(f->frame_thread.td.thread, NULL); + freep(&f->frame_thread.b); + dav1d_freep_aligned(&f->frame_thread.pal_idx); + dav1d_freep_aligned(&f->frame_thread.cf); + freep(&f->frame_thread.tile_start_off); + dav1d_freep_aligned(&f->frame_thread.pal); + freep(&f->frame_thread.cbi); + pthread_mutex_destroy(&f->frame_thread.td.lock); + pthread_cond_destroy(&f->frame_thread.td.cond); + } + if (f->n_tc > 1 && f->tc && f->tile_thread.inited) { + pthread_mutex_lock(&f->tile_thread.lock); + for (int m = 0; m < f->n_tc; m++) { + Dav1dTileContext *const t = &f->tc[m]; + t->tile_thread.die = 1; + // mark not created tile threads as available + if (!t->tile_thread.td.inited) + f->tile_thread.available |= 1ULL<<m; + } + pthread_cond_broadcast(&f->tile_thread.cond); + while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc)) + pthread_cond_wait(&f->tile_thread.icond, + &f->tile_thread.lock); + pthread_mutex_unlock(&f->tile_thread.lock); + for (int m = 0; m < f->n_tc; m++) { + Dav1dTileContext *const t = &f->tc[m]; + if (f->n_tc > 1 && t->tile_thread.td.inited) { + pthread_join(t->tile_thread.td.thread, NULL); + pthread_mutex_destroy(&t->tile_thread.td.lock); + pthread_cond_destroy(&t->tile_thread.td.cond); + } + } + pthread_mutex_destroy(&f->tile_thread.lock); + pthread_cond_destroy(&f->tile_thread.cond); + pthread_cond_destroy(&f->tile_thread.icond); +
freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); + } + for (int m = 0; f->ts && m < f->n_ts; m++) { + Dav1dTileState *const ts = &f->ts[m]; + pthread_cond_destroy(&ts->tile_thread.cond); + pthread_mutex_destroy(&ts->tile_thread.lock); + } + dav1d_free_aligned(f->ts); + dav1d_free_aligned(f->tc); + dav1d_free_aligned(f->ipred_edge[0]); + free(f->a); + free(f->tile); + free(f->lf.mask); + free(f->lf.lr_mask); + free(f->lf.level); + free(f->lf.tx_lpf_right_edge[0]); + dav1d_refmvs_clear(&f->rf); + dav1d_free_aligned(f->lf.cdef_line_buf); + dav1d_free_aligned(f->lf.lr_lpf_line[0]); + } + dav1d_free_aligned(c->fc); + dav1d_data_unref_internal(&c->in); + if (c->n_fc > 1 && c->frame_thread.out_delayed) { + for (unsigned n = 0; n < c->n_fc; n++) + if (c->frame_thread.out_delayed[n].p.data[0]) + dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]); + free(c->frame_thread.out_delayed); + } + for (int n = 0; n < c->n_tile_data; n++) + dav1d_data_unref_internal(&c->tile[n].data); + free(c->tile); + for (int n = 0; n < 8; n++) { + dav1d_cdf_thread_unref(&c->cdf[n]); + if (c->refs[n].p.p.data[0]) + dav1d_thread_picture_unref(&c->refs[n].p); + dav1d_ref_dec(&c->refs[n].refmvs); + dav1d_ref_dec(&c->refs[n].segmap); + } + dav1d_ref_dec(&c->seq_hdr_ref); + dav1d_ref_dec(&c->frame_hdr_ref); + + dav1d_ref_dec(&c->mastering_display_ref); + dav1d_ref_dec(&c->content_light_ref); + dav1d_ref_dec(&c->itut_t35_ref); + + if (c->mem_pools_inited) { + dav1d_mem_pool_destroy(&c->seq_hdr_pool); + dav1d_mem_pool_destroy(&c->frame_hdr_pool); + dav1d_mem_pool_destroy(&c->segmap_pool); + dav1d_mem_pool_destroy(&c->refmvs_pool); + dav1d_mem_pool_destroy(&c->cdf_pool); + if (c->mem_pools_inited == 2) + dav1d_mem_pool_destroy(&c->picture_pool); + } + + dav1d_freep_aligned(c_out); +} + +void dav1d_picture_unref(Dav1dPicture *const p) { + dav1d_picture_unref_internal(p); +} + +uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) { + return dav1d_data_create_internal(buf, sz); +} + 
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, + const size_t sz, + void (*const free_callback)(const uint8_t *data, + void *user_data), + void *const user_data) +{ /* public wrapper: passes every argument through to dav1d_data_wrap_internal() and returns its result */ + return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data); +} +/* Public wrapper: passes every argument through to dav1d_data_wrap_user_data_internal() and returns its result. */ +int dav1d_data_wrap_user_data(Dav1dData *const buf, + const uint8_t *const user_data, + void (*const free_callback)(const uint8_t *user_data, + void *cookie), + void *const cookie) +{ + return dav1d_data_wrap_user_data_internal(buf, + user_data, + free_callback, + cookie); +} +/* Public wrapper: forwards buf to dav1d_data_unref_internal(). */ +void dav1d_data_unref(Dav1dData *const buf) { + dav1d_data_unref_internal(buf); +} diff --git a/third_party/dav1d/src/log.c b/third_party/dav1d/src/log.c new file mode 100644 index 0000000000..de6776a617 --- /dev/null +++ b/third_party/dav1d/src/log.c @@ -0,0 +1,57 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <stdarg.h> +#include <stdio.h> + +#include "dav1d/dav1d.h" + +#include "common/validate.h" + +#include "src/internal.h" +#include "src/log.h" + +#if CONFIG_LOG +COLD void dav1d_log_default_callback(void *const cookie, + const char *const format, va_list ap) +{ /* default logger: prints the formatted message to stderr; cookie is not used */ + vfprintf(stderr, format, ap); +} +/* Formats a printf-style message and hands it to c->logger.callback together with c->logger.cookie; returns without output when no callback is installed. */ +COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) { + validate_input(c != NULL); + + if (!c->logger.callback) + return; + + va_list ap; + va_start(ap, format); + c->logger.callback(c->logger.cookie, format, ap); + va_end(ap); +} +#endif diff --git a/third_party/dav1d/src/log.h b/third_party/dav1d/src/log.h new file mode 100644 index 0000000000..df32de7f25 --- /dev/null +++ b/third_party/dav1d/src/log.h @@ -0,0 +1,47 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOG_H +#define DAV1D_SRC_LOG_H + +#include "config.h" + +#include <stdarg.h> + +#include "dav1d/dav1d.h" + +#include "common/attributes.h" +/* With CONFIG_LOG set the logging functions are declared; otherwise the default callback is NULL and dav1d_log(...) expands to an empty statement. */ +#if CONFIG_LOG +#define dav1d_log dav1d_log +void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); +void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3); +#else +#define dav1d_log_default_callback NULL +#define dav1d_log(...) do { } while(0) +#endif + +#endif /* DAV1D_SRC_LOG_H */ diff --git a/third_party/dav1d/src/loopfilter.h b/third_party/dav1d/src/loopfilter.h new file mode 100644 index 0000000000..c159050b26 --- /dev/null +++ b/third_party/dav1d/src/loopfilter.h @@ -0,0 +1,59 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOOPFILTER_H +#define DAV1D_SRC_LOOPFILTER_H + +#include <stdint.h> +#include <stddef.h> + +#include "common/bitdepth.h" + +#include "src/levels.h" +#include "src/lf_mask.h" +/* Declares a superblock loop-filter function called `name`; loopfilter_sb_fn is the matching function-pointer type. */ +#define decl_loopfilter_sb_fn(name) \ +void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \ + const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \ + const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX) +typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn); +/* Table of loop-filter entry points, filled in by the dav1d_loop_filter_dsp_init* functions declared below. */ +typedef struct Dav1dLoopFilterDSPContext { + /* + * dimension 1: plane (0=luma, 1=chroma) + * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v) + * + * dst/stride are aligned by 32 + */ + loopfilter_sb_fn loop_filter_sb[2][2]; +} Dav1dLoopFilterDSPContext; + +bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c); +bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c); +bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c); + +#endif /* DAV1D_SRC_LOOPFILTER_H */ diff --git a/third_party/dav1d/src/loopfilter_tmpl.c b/third_party/dav1d/src/loopfilter_tmpl.c new file mode 100644 index 0000000000..6ea744f37b --- /dev/null +++ b/third_party/dav1d/src/loopfilter_tmpl.c @@ -0,0 +1,260 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <stdlib.h> + +#include "common/attributes.h" +#include "common/intops.h" + +#include "src/loopfilter.h" +/* Filters one edge segment: 4 positions spaced stridea apart, with pixels p6..p0 / q0..q6 read at multiples of strideb on either side of dst. E, I and H are thresholds that get shifted up to the actual bit depth; wd (4, 6, 8 or 16 from the callers in this file) is the widest filter that may be applied. */ +static NOINLINE void +loop_filter(pixel *dst, int E, int I, int H, + const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd + HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int F = 1 << bitdepth_min_8; /* flatness threshold used by flat8in / flat8out */ + E <<= bitdepth_min_8; + I <<= bitdepth_min_8; + H <<= bitdepth_min_8; + + for (int i = 0; i < 4; i++, dst += stridea) { + int p6, p5, p4, p3, p2; + int p1 = dst[strideb * -2], p0 = dst[strideb * -1]; + int q0 = dst[strideb * +0], q1 = dst[strideb * +1]; + int q2, q3, q4, q5, q6; + int fm, flat8out, flat8in; + /* fm: filter mask; every neighbouring difference must stay within I and the step across the edge within E */ + fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I && + abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E; + + if (wd > 4) { + p2 = dst[strideb * -3]; + q2 = dst[strideb * +2]; + + fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I; + + if (wd > 6) { + p3 = dst[strideb * -4]; + q3 = dst[strideb * +3]; + + fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I; + } + } + if (!fm) continue; /* mask test failed: leave this position untouched */ + + if (wd >= 16) { + p6 = dst[strideb * -7]; + p5 = dst[strideb * -6]; + p4 = dst[strideb * -5]; + q4 = dst[strideb * +4]; +
q5 = dst[strideb * +5]; + q6 = dst[strideb * +6]; + + flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F && + abs(p4 - p0) <= F && abs(q4 - q0) <= F && + abs(q5 - q0) <= F && abs(q6 - q0) <= F; + } + /* flat8in is only assigned for wd >= 6 and only read in branches guarded by wd >= 8 or wd == 6 */ + if (wd >= 6) + flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F && + abs(q1 - q0) <= F && abs(q2 - q0) <= F; + + if (wd >= 8) + flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F; + + if (wd >= 16 && (flat8out & flat8in)) { /* widest filter: rewrites 6 pixels on each side of the edge */ + dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 + + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4; + dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 + + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4; + dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 + + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4; + dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 + + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4; + dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4; + dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4; + dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4; + dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4; + dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + + q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4; + dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + + q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4; + dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + + q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4; + dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + + q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4; + } else if (wd >= 8 && flat8in) { /* rewrites 3 pixels on each side of the edge */ + dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3; + dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3; + dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4)
>> 3; + dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3; + dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3; + dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3; + } else if (wd == 6 && flat8in) { /* rewrites 2 pixels on each side of the edge */ + dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3; + dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3; + dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3; + dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3; + } else { + const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H; /* when set, only p0/q0 are adjusted; otherwise p1/q1 are adjusted as well */ + +#define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \ + 128 * (1 << bitdepth_min_8) - 1) + + if (hev) { + int f = iclip_diff(p1 - q1), f1, f2; + f = iclip_diff(3 * (q0 - p0) + f); + + f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3; + f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3; + + dst[strideb * -1] = iclip_pixel(p0 + f2); + dst[strideb * +0] = iclip_pixel(q0 - f1); + } else { + int f = iclip_diff(3 * (q0 - p0)), f1, f2; + + f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3; + f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3; + + dst[strideb * -1] = iclip_pixel(p0 + f2); + dst[strideb * +0] = iclip_pixel(q0 - f1); + + f = (f1 + 1) >> 1; + dst[strideb * -2] = iclip_pixel(p1 + f); + dst[strideb * +1] = iclip_pixel(q1 - f); + } +#undef iclip_diff + } + } +} +/* Luma, h variant: walks the set bits of vmask[0..2], advancing dst by 4 pixel rows and l by b4_stride per bit, and filters each flagged edge with width 4, 8 or 16 (4 << idx). */ +static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int h + HIGHBD_DECL_SUFFIX) +{ + const unsigned vm = vmask[0] | vmask[1] | vmask[2]; + for (unsigned y = 1; vm & ~(y - 1); + y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride) + { + if (vm & y) { + const int L = l[0][0] ? l[0][0] : l[-1][0]; /* zero level: fall back to the previous entry */ + if (!L) continue; + const int H = L >> 4; + const int E = lut->e[L], I = lut->i[L]; + const int idx = (vmask[2] & y) ?
2 : !!(vmask[1] & y); + loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx + HIGHBD_TAIL_SUFFIX); + } + } +} +/* Luma, v variant: walks the set bits of vmask[0..2], advancing dst by 4 pixels and l by one entry per bit, and filters each flagged edge with width 4, 8 or 16 (4 << idx). */ +static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int w + HIGHBD_DECL_SUFFIX) +{ + const unsigned vm = vmask[0] | vmask[1] | vmask[2]; + for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { + if (vm & x) { + const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; /* zero level: fall back to the entry one b4_stride earlier */ + if (!L) continue; + const int H = L >> 4; + const int E = lut->e[L], I = lut->i[L]; + const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x); + loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx + HIGHBD_TAIL_SUFFIX); + } + } +} +/* Chroma, h variant: same walk as the luma h function but only vmask[0..1] are used and the width is 4 or 6 (4 + 2 * idx). */ +static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int h + HIGHBD_DECL_SUFFIX) +{ + const unsigned vm = vmask[0] | vmask[1]; + for (unsigned y = 1; vm & ~(y - 1); + y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride) + { + if (vm & y) { + const int L = l[0][0] ? l[0][0] : l[-1][0]; + if (!L) continue; + const int H = L >> 4; + const int E = lut->e[L], I = lut->i[L]; + const int idx = !!(vmask[1] & y); + loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx + HIGHBD_TAIL_SUFFIX); + } + } +} +/* Chroma, v variant: same walk as the luma v function but only vmask[0..1] are used and the width is 4 or 6 (4 + 2 * idx). */ +static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int w + HIGHBD_DECL_SUFFIX) +{ + const unsigned vm = vmask[0] | vmask[1]; + for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { + if (vm & x) { + const int L = l[0][0] ?
l[0][0] : l[-b4_stride][0]; + if (!L) continue; + const int H = L >> 4; + const int E = lut->e[L], I = lut->i[L]; + const int idx = !!(vmask[1] & x); + loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx + HIGHBD_TAIL_SUFFIX); + } + } +} +/* Installs the C functions above into c->loop_filter_sb[plane][direction], then, when HAVE_ASM is set, calls the ARM or x86 init for the build architecture. */ +COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { + c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; + c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; + c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c; + c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_loop_filter_dsp_init_arm)(c); +#elif ARCH_X86 + bitfn(dav1d_loop_filter_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/looprestoration.h b/third_party/dav1d/src/looprestoration.h new file mode 100644 index 0000000000..539a76bec3 --- /dev/null +++ b/third_party/dav1d/src/looprestoration.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOOPRESTORATION_H +#define DAV1D_SRC_LOOPRESTORATION_H + +#include <stdint.h> +#include <stddef.h> + +#include "common/bitdepth.h" +/* Bit flags, one bit each so they can be OR-ed together, naming the sides of a restoration unit for which neighbouring pixels exist. */ +enum LrEdgeFlags { + LR_HAVE_LEFT = 1 << 0, + LR_HAVE_RIGHT = 1 << 1, + LR_HAVE_TOP = 1 << 2, + LR_HAVE_BOTTOM = 1 << 3, +}; +/* Rows of 4 left-neighbour pixels when the pixel type is known (BITDEPTH defined), an opaque pointer otherwise. */ +#ifdef BITDEPTH +typedef const pixel (*const_left_pixel_row)[4]; +#else +typedef const void *const_left_pixel_row; +#endif + +// Although the spec applies restoration filters over 4x4 blocks, the wiener
+// filter can be applied to a bigger surface. +// * w is constrained by the restoration unit size (w <= 256) +// * h is constrained by the stripe height (h <= 64) +#define decl_wiener_filter_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const_left_pixel_row left, \ + const pixel *lpf, ptrdiff_t lpf_stride, \ + int w, int h, const int16_t filterh[7], \ + const int16_t filterv[7], enum LrEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) +typedef decl_wiener_filter_fn(*wienerfilter_fn); + +#define decl_selfguided_filter_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const_left_pixel_row left, \ + const pixel *lpf, ptrdiff_t lpf_stride, \ + int w, int h, int sgr_idx, const int16_t sgr_w[2], \ + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +typedef decl_selfguided_filter_fn(*selfguided_fn); +/* Function-pointer table holding one Wiener and one self-guided filter implementation. */ +typedef struct Dav1dLoopRestorationDSPContext { + wienerfilter_fn wiener; + selfguided_fn selfguided; +} Dav1dLoopRestorationDSPContext; + +bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc); +bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc); +bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c); +bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c); + +#endif /* DAV1D_SRC_LOOPRESTORATION_H */ diff --git a/third_party/dav1d/src/looprestoration_tmpl.c b/third_party/dav1d/src/looprestoration_tmpl.c new file mode 100644 index 0000000000..6ed3c56cc0 --- /dev/null +++ b/third_party/dav1d/src/looprestoration_tmpl.c @@ -0,0 +1,521 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer.
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include + +#include "common/intops.h" + +#include "src/looprestoration.h" +#include "src/tables.h" + +// 256 * 1.5 + 3 + 3 = 390 +#define REST_UNIT_STRIDE (390) + +// TODO Reuse p when no padding is needed (add and remove lpf pixels in p) +// TODO Chroma only requires 2 rows of padding. 
+static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride, + const pixel (*left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + int unit_w, const int stripe_h, const enum LrEdgeFlags edges) +{ /* Builds in dst (row stride REST_UNIT_STRIDE) a copy of the unit_w x stripe_h area at p with a 3-pixel border on every side: border pixels come from lpf, left or the pixels next to p when the matching edge flag is set, and are replicated from the nearest row/column otherwise. */ + const int have_left = !!(edges & LR_HAVE_LEFT); + const int have_right = !!(edges & LR_HAVE_RIGHT); + + // Copy more pixels if we don't have to pad them + unit_w += 3 * have_left + 3 * have_right; + pixel *dst_l = dst + 3 * !have_left; + p -= 3 * have_left; + lpf -= 3 * have_left; + + if (edges & LR_HAVE_TOP) { + // Copy previous loop filtered rows + const pixel *const above_1 = lpf; + const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride); + pixel_copy(dst_l, above_1, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w); + } else { + // Pad with first row + pixel_copy(dst_l, p, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w); + if (have_left) { + pixel_copy(dst_l, &left[0][1], 3); + pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3); + } + } + /* dst_tl: first row after the 3 top border rows */ + pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; + if (edges & LR_HAVE_BOTTOM) { + // Copy next loop filtered rows + const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride); + const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); + } else { + // Pad with last row + const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w); + if (have_left) { + pixel_copy(dst_tl + stripe_h *
REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + } + } + + // Inner UNIT_WxSTRIPE_H + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); + dst_tl += REST_UNIT_STRIDE; + p += PXSTRIDE(p_stride); + } + + if (!have_right) { + pixel *pad = dst_l + unit_w; + pixel *row_last = &dst_l[unit_w - 1]; + // Pad 3x(STRIPE_H+6) with last column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(pad, *row_last, 3); + pad += REST_UNIT_STRIDE; + row_last += REST_UNIT_STRIDE; + } + } + + if (!have_left) { + // Pad 3x(STRIPE_H+6) with first column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(dst, *dst_l, 3); + dst += REST_UNIT_STRIDE; + dst_l += REST_UNIT_STRIDE; + } + } else { /* left border of the inner rows comes from entries 1..3 of each left[] row */ + dst += 3 * REST_UNIT_STRIDE; + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst, &left[j][1], 3); + dst += REST_UNIT_STRIDE; + } + } +} +/* C Wiener filter: pads the w x h unit into tmp, runs the 7-tap horizontal pass (filterh) into hor, then the 7-tap vertical pass (filterv) and writes the clipped result back into p. */ +// FIXME Could split into luma and chroma specific functions, +// (since first and last taps are always 0 for chroma) +// FIXME Could implement a version that requires less temporary memory +// (should be possible to implement with only 6 rows of temp storage) +static void wiener_c(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, const int h, + const int16_t filterh[7], const int16_t filterv[7], + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; + pixel *tmp_ptr = tmp; + + padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + + // Values stored between horizontal and vertical filtering don't + // fit in a uint8_t.
+ uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; + uint16_t *hor_ptr = hor; + + const int bitdepth = bitdepth_from_max(bitdepth_max); + const int round_bits_h = 3 + (bitdepth == 12) * 2; + const int rounding_off_h = 1 << (round_bits_h - 1); + const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h); + for (int j = 0; j < h + 6; j++) { /* h + 6 rows: the vertical pass below reads rows j .. j + 6 of hor */ + for (int i = 0; i < w; i++) { + int sum = (tmp_ptr[i + 3] << 7) + (1 << (bitdepth + 6)); + + for (int k = 0; k < 7; k++) { + sum += tmp_ptr[i + k] * filterh[k]; + } + + hor_ptr[i] = + iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); + } + tmp_ptr += REST_UNIT_STRIDE; + hor_ptr += REST_UNIT_STRIDE; + } + + const int round_bits_v = 11 - (bitdepth == 12) * 2; + const int rounding_off_v = 1 << (round_bits_v - 1); + const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset; + + for (int k = 0; k < 7; k++) { + sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k]; + } + + p[j * PXSTRIDE(p_stride) + i] = + iclip_pixel((sum + rounding_off_v) >> round_bits_v); + } + } +} + +// Sum over a 3x3 area +// The dst and src pointers are positioned 3 pixels above and 3 pixels to the +// left of the top left corner. However, the self guided filter only needs 1 +// pixel above and one pixel to the left. As for the pixels below and to the +// right they must be computed in the sums, but don't need to be stored.
+// +// Example for a 4x4 block: +// x x x x x x x x x x +// x c c c c c c c c x +// x i s s s s s s i x +// x i s s s s s s i x +// x i s s s s s s i x +// x i s s s s s s i x +// x i s s s s s s i x +// x i s s s s s s i x +// x c c c c c c c c x +// x x x x x x x x x x +// +// s: Pixel summed and stored +// i: Pixel summed and stored (between loops) +// c: Pixel summed not stored +// x: Pixel not summed not stored +static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src, + const int w, const int h) +{ + // We skip the first row, as it is never used + src += REST_UNIT_STRIDE; + + // We skip the first and last columns, as they are never used + for (int x = 1; x < w - 1; x++) { /* pass 1: per column, sums (and sums of squares) of 3 vertically adjacent pixels */ + coef *sum_v = sum + x; + int32_t *sumsq_v = sumsq + x; + const pixel *s = src + x; + int a = s[0], a2 = a * a; + int b = s[REST_UNIT_STRIDE], b2 = b * b; + + // We skip the first 2 rows, as they are skipped in the next loop, and + // we don't need the last 2 rows, as they are skipped in the next loop + for (int y = 2; y < h - 2; y++) { + s += REST_UNIT_STRIDE; + const int c = s[REST_UNIT_STRIDE]; + const int c2 = c * c; + sum_v += REST_UNIT_STRIDE; + sumsq_v += REST_UNIT_STRIDE; + *sum_v = a + b + c; + *sumsq_v = a2 + b2 + c2; + a = b; + a2 = b2; + b = c; + b2 = c2; + } + } + + // We skip the first row as it is never read + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; + // We skip the last 2 rows as they are never read + for (int y = 2; y < h - 2; y++) { /* pass 2: per row, add 3 horizontally adjacent column sums in place */ + int a = sum[1], a2 = sumsq[1]; + int b = sum[2], b2 = sumsq[2]; + + // We don't store the first column as it is never read and + // we don't store the last 2 columns as they are never read + for (int x = 2; x < w - 2; x++) { + const int c = sum[x + 1], c2 = sumsq[x + 1]; + sum[x] = a + b + c; + sumsq[x] = a2 + b2 + c2; + a = b; + a2 = b2; + b = c; + b2 = c2; + } + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; + } +} + +// Sum over a 5x5 area +// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1 +// pixel above and one pixel to the left. As for the pixels below and to the +// right they must be computed in the sums, but don't need to be stored. +// +// Example for a 4x4 block: +// c c c c c c c c c c +// c c c c c c c c c c +// i i s s s s s s i i +// i i s s s s s s i i +// i i s s s s s s i i +// i i s s s s s s i i +// i i s s s s s s i i +// i i s s s s s s i i +// c c c c c c c c c c +// c c c c c c c c c c +// +// s: Pixel summed and stored +// i: Pixel summed and stored (between loops) +// c: Pixel summed not stored +// x: Pixel not summed not stored +static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src, + const int w, const int h) +{ + for (int x = 0; x < w; x++) { /* pass 1: per column, sums (and sums of squares) of 5 vertically adjacent pixels */ + coef *sum_v = sum + x; + int32_t *sumsq_v = sumsq + x; + const pixel *s = src + 3 * REST_UNIT_STRIDE + x; + int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a; + int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b; + int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c; + int d = s[0], d2 = d * d; + + // We skip the first 2 rows, as they are skipped in the next loop, and + // we don't need the last 2 rows, as they are skipped in the next loop + for (int y = 2; y < h - 2; y++) { + s += REST_UNIT_STRIDE; + const int e = *s, e2 = e * e; + sum_v += REST_UNIT_STRIDE; + sumsq_v += REST_UNIT_STRIDE; + *sum_v = a + b + c + d + e; + *sumsq_v = a2 + b2 + c2 + d2 + e2; + a = b; + b = c; + c = d; + d = e; + a2 = b2; + b2 = c2; + c2 = d2; + d2 = e2; + } + } + + // We skip the first row as it is never read + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; + for (int y = 2; y < h - 2; y++) { /* pass 2: per row, add 5 horizontally adjacent column sums in place */ + int a = sum[0], a2 = sumsq[0]; + int b = sum[1], b2 = sumsq[1]; + int c = sum[2], c2 = sumsq[2]; + int d = sum[3], d2 = sumsq[3]; + + for (int x = 2; x < w - 2; x++) { + const int e = sum[x + 2], e2 = sumsq[x + 2]; + sum[x] = a + b + c + d + e; + sumsq[x] = a2 + b2 + c2 + d2 + e2; + a = b; + b = c; + c = d; + d = e; + a2 = b2; + b2 = c2;
+ c2 = d2; + d2 = e2; + } + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; + } +} +/* Runs one self-guided pass over the padded block src and writes w x h coefficients to dst with a row stride of 384. n is the box area: 25 selects the 5x5 box sums, any other value the 3x3 path (callers in this file pass 9); s scales the variance term before the dav1d_sgr_x_by_x lookup. */ +static void selfguided_filter(coef *dst, const pixel *src, + const ptrdiff_t src_stride, const int w, + const int h, const int n, const int s + HIGHBD_DECL_SUFFIX) +{ + const int sgr_one_by_x = n == 25 ? 164 : 455; + + // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE]; + int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3; + // By inverting A and B after the boxsums, B can be of size coef instead + // of int32_t + coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE]; + coef *B = sum + 2 * REST_UNIT_STRIDE + 3; + + const int step = (n == 25) + 1; /* the 5x5 variant computes A/B on every second row only */ + if (n == 25) + boxsum5(sumsq, sum, src, w + 6, h + 6); + else + boxsum3(sumsq, sum, src, w + 6, h + 6); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + + int32_t *AA = A - REST_UNIT_STRIDE; + coef *BB = B - REST_UNIT_STRIDE; + for (int j = -1; j < h + 1; j+= step) { + for (int i = -1; i < w + 1; i++) { + const int a = + (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8); + const int b = + (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8; + + const unsigned p = imax(a * n - b * b, 0); + const unsigned z = (p * s + (1 << 19)) >> 20; + const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)]; + + // This is where we invert A and B, so that B is of size coef.
+ AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; + BB[i] = 256 - x; + } + AA += step * REST_UNIT_STRIDE; + BB += step * REST_UNIT_STRIDE; + } + + src += 3 * REST_UNIT_STRIDE + 3; + if (n == 25) { /* 5x5: rows alternate between a weighting that uses the A/B rows above and below and one that uses the current A/B row */ + int j = 0; +#define SIX_NEIGHBORS(P, i)\ + ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \ + (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \ + P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5) + for (; j < h - 1; j+=2) { + for (int i = 0; i < w; i++) { + const int a = SIX_NEIGHBORS(B, i); + const int b = SIX_NEIGHBORS(A, i); + dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + } + dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */; + src += REST_UNIT_STRIDE; + B += REST_UNIT_STRIDE; + A += REST_UNIT_STRIDE; + for (int i = 0; i < w; i++) { + const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5; + const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5; + dst[i] = (a * src[i] + b + (1 << 7)) >> 8; + } + dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */; + src += REST_UNIT_STRIDE; + B += REST_UNIT_STRIDE; + A += REST_UNIT_STRIDE; + } + if (j + 1 == h) { // Last row, when number of rows is odd + for (int i = 0; i < w; i++) { + const int a = SIX_NEIGHBORS(B, i); + const int b = SIX_NEIGHBORS(A, i); + dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + } + } +#undef SIX_NEIGHBORS + } else { /* 3x3: every row uses the full 3x3 neighbourhood of A and B */ +#define EIGHT_NEIGHBORS(P, i)\ + ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \ + (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \ + P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3) + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int a = EIGHT_NEIGHBORS(B, i); + const int b = EIGHT_NEIGHBORS(A, i); + dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + } + dst += 384; + src += REST_UNIT_STRIDE; + B += REST_UNIT_STRIDE; + A += REST_UNIT_STRIDE; + } + } +#undef EIGHT_NEIGHBORS +} + +static void selfguided_c(pixel *p, const ptrdiff_t p_stride, + const
pixel (*const left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, const int h, const int sgr_idx, + const int16_t sgr_w[2], const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX) +{ + // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; + + padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + + // Selfguided filter outputs to a maximum stripe height of 64 and a + // maximum restoration width of 384 (256 * 1.5) + coef dst[64 * 384]; + + // both r1 and r0 can't be zero + if (!dav1d_sgr_params[sgr_idx][0]) { + const int s1 = dav1d_sgr_params[sgr_idx][3]; + selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX); + const int w1 = (1 << 7) - sgr_w[1]; + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w1 * (dst[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); + } + p += PXSTRIDE(p_stride); + } + } else if (!dav1d_sgr_params[sgr_idx][1]) { + const int s0 = dav1d_sgr_params[sgr_idx][2]; + selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX); + const int w0 = sgr_w[0]; + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w0 * (dst[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); + } + p += PXSTRIDE(p_stride); + } + } else { + coef dst1[64 * 384]; + const int s0 = dav1d_sgr_params[sgr_idx][2]; + const int s1 = dav1d_sgr_params[sgr_idx][3]; + const int w0 = sgr_w[0]; + const int w1 = (1 << 7) - w0 - sgr_w[1]; + selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX); + selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX); + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) + + w1 * 
(dst1[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); + } + p += PXSTRIDE(p_stride); + } + } +} + +COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) { + c->wiener = wiener_c; + c->selfguided = selfguided_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc); +#elif ARCH_PPC64LE + bitfn(dav1d_loop_restoration_dsp_init_ppc)(c); +#elif ARCH_X86 + bitfn(dav1d_loop_restoration_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/lr_apply.h b/third_party/dav1d/src/lr_apply.h new file mode 100644 index 0000000000..638bb8b74b --- /dev/null +++ b/third_party/dav1d/src/lr_apply.h @@ -0,0 +1,44 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LR_APPLY_H +#define DAV1D_SRC_LR_APPLY_H + +#include +#include + +#include "common/bitdepth.h" + +#include "src/internal.h" + +void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, + /*const*/ pixel *const src[3], int sby); + +void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], + int sby); + +#endif /* DAV1D_SRC_LR_APPLY_H */ diff --git a/third_party/dav1d/src/lr_apply_tmpl.c b/third_party/dav1d/src/lr_apply_tmpl.c new file mode 100644 index 0000000000..02413b9137 --- /dev/null +++ b/third_party/dav1d/src/lr_apply_tmpl.c @@ -0,0 +1,302 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include + +#include "common/intops.h" + +#include "src/lr_apply.h" + +enum LrRestorePlanes { + LR_RESTORE_Y = 1 << 0, + LR_RESTORE_U = 1 << 1, + LR_RESTORE_V = 1 << 2, +}; + +// The loop filter buffer stores 12 rows of pixels. A superblock block will +// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above +// and 2 below) the final 4 rows are used to swap the bottom of the last +// stripe with the top of the next super block row. +static void backup_lpf(const Dav1dFrameContext *const f, + pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int ss_ver, const int sb128, + int row, const int row_h, const int src_w, + const int h, const int ss_hor) +{ + const int dst_w = f->frame_hdr->super_res.enabled ? + (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; + + // The first stripe of the frame is shorter by 8 luma pixel rows. + int stripe_h = (64 - 8 * !row) >> ss_ver; + + if (row) { + const int top = 4 << sb128; + // Copy the top part of the stored loop filtered pixels from the + // previous sb row needed above the first stripe of this sb row. 
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], + &dst[PXSTRIDE(dst_stride) * top], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], + &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], + &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], + &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + } + + dst += 4 * PXSTRIDE(dst_stride); + src += (stripe_h - 2) * PXSTRIDE(src_stride); + + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + while (row + stripe_h <= row_h) { + const int n_lines = 4 - (row + stripe_h + 1 == h); + f->dsp->mc.resize(dst, dst_stride, src, src_stride, + dst_w, n_lines, src_w, f->resize_step[ss_hor], + f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX); + row += stripe_h; // unmodified stripe_h for the 1st stripe + stripe_h = 64 >> ss_ver; + src += stripe_h * PXSTRIDE(src_stride); + dst += n_lines * PXSTRIDE(dst_stride); + if (n_lines == 3) { + pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w); + dst += PXSTRIDE(dst_stride); + } + } + } else { + while (row + stripe_h <= row_h) { + const int n_lines = 4 - (row + stripe_h + 1 == h); + for (int i = 0; i < 4; i++) { + pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] : + src, src_w); + dst += PXSTRIDE(dst_stride); + src += PXSTRIDE(src_stride); + } + row += stripe_h; // unmodified stripe_h for the 1st stripe + stripe_h = 64 >> ss_ver; + src += (stripe_h - 4) * PXSTRIDE(src_stride); + } + } +} + +void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, + /*const*/ pixel *const src[3], const int sby) +{ + const int offset = 8 * !!sby; + const ptrdiff_t *const src_stride = f->cur.stride; + const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel); + + // TODO Also check block level restore type to reduce copying. 
+ const int restore_planes = f->lf.restore_planes; + + if (restore_planes & LR_RESTORE_Y) { + const int h = f->cur.p.h; + const int w = f->bw << 2; + const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); + const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; + backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride, + src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], + 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0); + } + if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { + const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h = (f->cur.p.h + ss_ver) >> ss_ver; + const int w = f->bw << (2 - ss_hor); + const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); + const int offset_uv = offset >> ss_ver; + const int y_stripe = + (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + + if (restore_planes & LR_RESTORE_U) { + backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride, + src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + } + if (restore_planes & LR_RESTORE_V) { + backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride, + src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + } + } +} + +static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, + const pixel (*left)[4], int x, int y, + const int plane, const int unit_w, const int row_h, + const Av1RestorationUnit *const lr, enum LrEdgeFlags edges) +{ + const Dav1dDSPContext *const dsp = f->dsp; + const int chroma = !!plane; + const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420); + const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM); + const pixel *lpf = f->lf.lr_lpf_line[plane] + x; + const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma]; + const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) 
& ~31); + + // The first stripe of the frame is shorter by 8 luma pixel rows. + int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y); + + // FIXME [8] might be easier for SIMD + int16_t filterh[7], filterv[7]; + if (lr->type == DAV1D_RESTORATION_WIENER) { + filterh[0] = filterh[6] = lr->filter_h[0]; + filterh[1] = filterh[5] = lr->filter_h[1]; + filterh[2] = filterh[4] = lr->filter_h[2]; + filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2); + + filterv[0] = filterv[6] = lr->filter_v[0]; + filterv[1] = filterv[5] = lr->filter_v[1]; + filterv[2] = filterv[4] = lr->filter_v[2]; + filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2); + } + + while (y + stripe_h <= row_h) { + // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h) + edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; + if (lr->type == DAV1D_RESTORATION_WIENER) { + dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, + filterh, filterv, edges HIGHBD_CALL_SUFFIX); + } else { + assert(lr->type == DAV1D_RESTORATION_SGRPROJ); + dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, + lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX); + } + + left += stripe_h; + y += stripe_h; + if (y + stripe_h > row_h && sbrow_has_bottom) break; + p += stripe_h * PXSTRIDE(p_stride); + edges |= LR_HAVE_TOP; + stripe_h = imin(64 >> ss_ver, row_h - y); + if (stripe_h == 0) break; + lpf += 4 * PXSTRIDE(lpf_stride); + } +} + +static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride, + int u) +{ + for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride)) + pixel_copy(dst, src, 4); +} + +static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y, + const int w, const int h, const int row_h, const int plane) +{ + const int chroma = !!plane; + const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420); + const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444); + const ptrdiff_t 
p_stride = f->sr_cur.p.stride[chroma]; + + const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane]; + const int unit_size = 1 << unit_size_log2; + const int half_unit_size = unit_size >> 1; + const int max_unit_size = unit_size + half_unit_size; + + // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y) + const int row_y = y + ((8 >> ss_ver) * !!y); + + // FIXME This is an ugly hack to lookup the proper AV1Filter unit for + // chroma planes. Question: For Multithreaded decoding, is it better + // to store the chroma LR information with collocated Luma information? + // In other words. For a chroma restoration unit locate at 128,128 and + // with a 4:2:0 chroma subsampling, do we store the filter information at + // the AV1Filter unit located at (128,128) or (256,256) + // TODO Support chroma subsampling. + const int shift_hor = 7 - ss_hor; + + pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4]; + const Av1RestorationUnit *lr[2]; + + enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT | + (row_h < h ? 
LR_HAVE_BOTTOM : 0); + + int aligned_unit_pos = row_y & ~(unit_size - 1); + if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h) + aligned_unit_pos -= unit_size; + aligned_unit_pos <<= ss_ver; + const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w; + const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1; + lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx]; + int restore = lr[0]->type != DAV1D_RESTORATION_NONE; + int x = 0, bit = 0; + for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) { + const int next_x = x + unit_size; + const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1); + lr[!bit] = + &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx]; + const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE; + if (restore_next) + backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y); + if (restore) + lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h, + lr[bit], edges); + x = next_x; + restore = restore_next; + } + if (restore) { + edges &= ~LR_HAVE_RIGHT; + const int unit_w = w - x; + lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges); + } +} + +void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], + const int sby) +{ + const int offset_y = 8 * !!sby; + const ptrdiff_t *const dst_stride = f->sr_cur.p.stride; + const int restore_planes = f->lf.restore_planes; + + if (restore_planes & LR_RESTORE_Y) { + const int h = f->sr_cur.p.p.h; + const int w = f->sr_cur.p.p.w; + const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h); + const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y; + lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w, + h, row_h, 0); + } + if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { + const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h = 
(f->sr_cur.p.p.h + ss_ver) >> ss_ver; + const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h); + const int offset_uv = offset_y >> ss_ver; + const int y_stripe = + (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + if (restore_planes & LR_RESTORE_U) + lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, + w, h, row_h, 1); + + if (restore_planes & LR_RESTORE_V) + lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, + w, h, row_h, 2); + } +} diff --git a/third_party/dav1d/src/mc.h b/third_party/dav1d/src/mc.h new file mode 100644 index 0000000000..784b58d221 --- /dev/null +++ b/third_party/dav1d/src/mc.h @@ -0,0 +1,138 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_MC_H +#define DAV1D_SRC_MC_H + +#include +#include + +#include "common/bitdepth.h" + +#include "src/levels.h" + +#define decl_mc_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const pixel *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my HIGHBD_DECL_SUFFIX) +typedef decl_mc_fn(*mc_fn); + +#define decl_mc_scaled_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const pixel *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX) +typedef decl_mc_scaled_fn(*mc_scaled_fn); + +#define decl_warp8x8_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const pixel *src, ptrdiff_t src_stride, \ + const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX) +typedef decl_warp8x8_fn(*warp8x8_fn); + +#define decl_mct_fn(name) \ +void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my HIGHBD_DECL_SUFFIX) +typedef decl_mct_fn(*mct_fn); + +#define decl_mct_scaled_fn(name) \ +void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX) +typedef decl_mct_scaled_fn(*mct_scaled_fn); + +#define decl_warp8x8t_fn(name) \ +void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \ + const pixel *src, ptrdiff_t src_stride, \ + const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX) +typedef decl_warp8x8t_fn(*warp8x8t_fn); + +#define 
decl_avg_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const int16_t *tmp1, const int16_t *tmp2, int w, int h \ + HIGHBD_DECL_SUFFIX) +typedef decl_avg_fn(*avg_fn); + +#define decl_w_avg_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight \ + HIGHBD_DECL_SUFFIX) +typedef decl_w_avg_fn(*w_avg_fn); + +#define decl_mask_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const int16_t *tmp1, const int16_t *tmp2, int w, int h, \ + const uint8_t *mask HIGHBD_DECL_SUFFIX) +typedef decl_mask_fn(*mask_fn); + +#define decl_w_mask_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const int16_t *tmp1, const int16_t *tmp2, int w, int h, \ + uint8_t *mask, int sign HIGHBD_DECL_SUFFIX) +typedef decl_w_mask_fn(*w_mask_fn); + +#define decl_blend_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \ + int w, int h, const uint8_t *mask) +typedef decl_blend_fn(*blend_fn); + +#define decl_blend_dir_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h) +typedef decl_blend_dir_fn(*blend_dir_fn); + +#define decl_emu_edge_fn(name) \ +void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \ + pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride) +typedef decl_emu_edge_fn(*emu_edge_fn); + +#define decl_resize_fn(name) \ +void (name)(pixel *dst, ptrdiff_t dst_stride, \ + const pixel *src, ptrdiff_t src_stride, \ + int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX) +typedef decl_resize_fn(*resize_fn); + +typedef struct Dav1dMCDSPContext { + mc_fn mc[N_2D_FILTERS]; + mc_scaled_fn mc_scaled[N_2D_FILTERS]; + mct_fn mct[N_2D_FILTERS]; + mct_scaled_fn mct_scaled[N_2D_FILTERS]; + avg_fn avg; + w_avg_fn w_avg; + mask_fn mask; + w_mask_fn w_mask[3 /* 444, 422, 420 */]; + blend_fn blend; + blend_dir_fn blend_v; + blend_dir_fn blend_h; + warp8x8_fn warp8x8; + 
warp8x8t_fn warp8x8t; + emu_edge_fn emu_edge; + resize_fn resize; +} Dav1dMCDSPContext; + +bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c); +bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c); +bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c); + +#endif /* DAV1D_SRC_MC_H */ diff --git a/third_party/dav1d/src/mc_tmpl.c b/third_party/dav1d/src/mc_tmpl.c new file mode 100644 index 0000000000..c4d9e14eb8 --- /dev/null +++ b/third_party/dav1d/src/mc_tmpl.c @@ -0,0 +1,954 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include + +#include "common/attributes.h" +#include "common/intops.h" + +#include "src/mc.h" +#include "src/tables.h" + +#if BITDEPTH == 8 +#define get_intermediate_bits(bitdepth_max) 4 +// Output in interval [-5132, 9212], fits in int16_t as is +#define PREP_BIAS 0 +#else +// 4 for 10 bits/component, 2 for 12 bits/component +#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max)) +// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit) +// Subtract a bias to ensure the output fits in int16_t +#define PREP_BIAS 8192 +#endif + +static NOINLINE void +put_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, const int w, int h) +{ + do { + pixel_copy(dst, src, w); + + dst += dst_stride; + src += src_stride; + } while (--h); +} + +static NOINLINE void +prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride, + const int w, int h HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + do { + for (int x = 0; x < w; x++) + tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS; + + tmp += w; + src += src_stride; + } while (--h); +} + +#define FILTER_8TAP(src, x, F, stride) \ + (F[0] * src[x + -3 * stride] + \ + F[1] * src[x + -2 * stride] + \ + F[2] * src[x + -1 * stride] + \ + F[3] * src[x + +0 * stride] + \ + F[4] * src[x + +1 * stride] + \ + F[5] * src[x + +2 * stride] + \ + F[6] * src[x + +3 * stride] + \ + F[7] * src[x + +4 * stride]) + +#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \ + ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) + +#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \ + iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh)) + +#define GET_H_FILTER(mx) \ + const int8_t *const fh = !(mx) ? NULL : w > 4 ? 
\ + dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \ + dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1] + +#define GET_V_FILTER(my) \ + const int8_t *const fv = !(my) ? NULL : h > 4 ? \ + dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \ + dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1] + +#define GET_FILTERS() \ + GET_H_FILTER(mx); \ + GET_V_FILTER(my) + +static NOINLINE void +put_8tap_c(pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, const int my, + const int filter_type HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int intermediate_rnd = (1 << intermediate_bits) >> 1; + + GET_FILTERS(); + dst_stride = PXSTRIDE(dst_stride); + src_stride = PXSTRIDE(src_stride); + + if (fh) { + if (fv) { + int tmp_h = h + 7; + int16_t mid[128 * 135], *mid_ptr = mid; + + src -= src_stride * 3; + do { + for (int x = 0; x < w; x++) + mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, + 6 - intermediate_bits); + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid + 128 * 3; + do { + for (int x = 0; x < w; x++) + dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, + 6 + intermediate_bits); + + mid_ptr += 128; + dst += dst_stride; + } while (--h); + } else { + do { + for (int x = 0; x < w; x++) { + const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, + 6 - intermediate_bits); + dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits); + } + + dst += dst_stride; + src += src_stride; + } while (--h); + } + } else if (fv) { + do { + for (int x = 0; x < w; x++) + dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6); + + dst += dst_stride; + src += src_stride; + } while (--h); + } else + put_c(dst, dst_stride, src, src_stride, w, h); +} + +static NOINLINE void +put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, int 
my, + const int dx, const int dy, const int filter_type + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int intermediate_rnd = (1 << intermediate_bits) >> 1; + int tmp_h = (((h - 1) * dy + my) >> 10) + 8; + int16_t mid[128 * (256 + 7)], *mid_ptr = mid; + src_stride = PXSTRIDE(src_stride); + + src -= src_stride * 3; + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + GET_H_FILTER(imx >> 6); + mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, + 6 - intermediate_bits) : + src[ioff] << intermediate_bits; + imx += dx; + ioff += imx >> 10; + imx &= 0x3ff; + } + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid + 128 * 3; + for (int y = 0; y < h; y++) { + int x; + GET_V_FILTER(my >> 6); + + for (x = 0; x < w; x++) + dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, + 6 + intermediate_bits) : + iclip_pixel((mid_ptr[x] + intermediate_rnd) >> + intermediate_bits); + + my += dy; + mid_ptr += (my >> 10) * 128; + my &= 0x3ff; + dst += PXSTRIDE(dst_stride); + } +} + +static NOINLINE void +prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, const int my, + const int filter_type HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + GET_FILTERS(); + src_stride = PXSTRIDE(src_stride); + + if (fh) { + if (fv) { + int tmp_h = h + 7; + int16_t mid[128 * 135], *mid_ptr = mid; + + src -= src_stride * 3; + do { + for (int x = 0; x < w; x++) + mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, + 6 - intermediate_bits); + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid + 128 * 3; + do { + for (int x = 0; x < w; x++) { + int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) - + PREP_BIAS; + assert(t >= INT16_MIN && t <= INT16_MAX); + tmp[x] = t; + } + + mid_ptr += 128; + tmp += w; + } while (--h); + } else { + do { + for (int x = 0; x < w; x++) + tmp[x] = 
DAV1D_FILTER_8TAP_RND(src, x, fh, 1, + 6 - intermediate_bits) - + PREP_BIAS; + + tmp += w; + src += src_stride; + } while (--h); + } + } else if (fv) { + do { + for (int x = 0; x < w; x++) + tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride, + 6 - intermediate_bits) - + PREP_BIAS; + + tmp += w; + src += src_stride; + } while (--h); + } else + prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); +} + +static NOINLINE void +prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, int my, + const int dx, const int dy, const int filter_type + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + int tmp_h = (((h - 1) * dy + my) >> 10) + 8; + int16_t mid[128 * (256 + 7)], *mid_ptr = mid; + src_stride = PXSTRIDE(src_stride); + + src -= src_stride * 3; + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + GET_H_FILTER(imx >> 6); + mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, + 6 - intermediate_bits) : + src[ioff] << intermediate_bits; + imx += dx; + ioff += imx >> 10; + imx &= 0x3ff; + } + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid + 128 * 3; + for (int y = 0; y < h; y++) { + int x; + GET_V_FILTER(my >> 6); + + for (x = 0; x < w; x++) + tmp[x] = (fv ? 
DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) + : mid_ptr[x]) - PREP_BIAS; + + my += dy; + mid_ptr += (my >> 10) * 128; + my &= 0x3ff; + tmp += w; + } +} + +#define filter_fns(type, type_h, type_v) \ +static void put_8tap_##type##_c(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel *const src, \ + const ptrdiff_t src_stride, \ + const int w, const int h, \ + const int mx, const int my \ + HIGHBD_DECL_SUFFIX) \ +{ \ + put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \ + type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ +} \ +static void put_8tap_##type##_scaled_c(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel *const src, \ + const ptrdiff_t src_stride, \ + const int w, const int h, \ + const int mx, const int my, \ + const int dx, const int dy \ + HIGHBD_DECL_SUFFIX) \ +{ \ + put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \ + type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ +} \ +static void prep_8tap_##type##_c(int16_t *const tmp, \ + const pixel *const src, \ + const ptrdiff_t src_stride, \ + const int w, const int h, \ + const int mx, const int my \ + HIGHBD_DECL_SUFFIX) \ +{ \ + prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \ + type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ +} \ +static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \ + const pixel *const src, \ + const ptrdiff_t src_stride, \ + const int w, const int h, \ + const int mx, const int my, \ + const int dx, const int dy \ + HIGHBD_DECL_SUFFIX) \ +{ \ + prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \ + type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ +} + +filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR) +filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP) +filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH) +filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH) +filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, 
DAV1D_FILTER_8TAP_REGULAR) +filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP) +filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP) +filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR) +filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH) + +#define FILTER_BILIN(src, x, mxy, stride) \ + (16 * src[x] + ((mxy) * (src[x + stride] - src[x]))) + +#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \ + ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh)) + +#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \ + iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh)) + +static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, const int my + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int intermediate_rnd = (1 << intermediate_bits) >> 1; + dst_stride = PXSTRIDE(dst_stride); + src_stride = PXSTRIDE(src_stride); + + if (mx) { + if (my) { + int16_t mid[128 * 129], *mid_ptr = mid; + int tmp_h = h + 1; + + do { + for (int x = 0; x < w; x++) + mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, + 4 - intermediate_bits); + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid; + do { + for (int x = 0; x < w; x++) + dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, + 4 + intermediate_bits); + + mid_ptr += 128; + dst += dst_stride; + } while (--h); + } else { + do { + for (int x = 0; x < w; x++) { + const int px = FILTER_BILIN_RND(src, x, mx, 1, + 4 - intermediate_bits); + dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits); + } + + dst += dst_stride; + src += src_stride; + } while (--h); + } + } else if (my) { + do { + for (int x = 0; x < w; x++) + dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4); + + dst += dst_stride; + src += src_stride; + } while (--h); + } else + put_c(dst, dst_stride, src, src_stride, w, h); 
+} + +static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, int my, + const int dx, const int dy + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + int tmp_h = (((h - 1) * dy + my) >> 10) + 2; + int16_t mid[128 * (256 + 1)], *mid_ptr = mid; + + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, + 4 - intermediate_bits); + imx += dx; + ioff += imx >> 10; + imx &= 0x3ff; + } + + mid_ptr += 128; + src += PXSTRIDE(src_stride); + } while (--tmp_h); + + mid_ptr = mid; + do { + int x; + + for (x = 0; x < w; x++) + dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128, + 4 + intermediate_bits); + + my += dy; + mid_ptr += (my >> 10) * 128; + my &= 0x3ff; + dst += PXSTRIDE(dst_stride); + } while (--h); +} + +static void prep_bilin_c(int16_t *tmp, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, const int my + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + src_stride = PXSTRIDE(src_stride); + + if (mx) { + if (my) { + int16_t mid[128 * 129], *mid_ptr = mid; + int tmp_h = h + 1; + + do { + for (int x = 0; x < w; x++) + mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, + 4 - intermediate_bits); + + mid_ptr += 128; + src += src_stride; + } while (--tmp_h); + + mid_ptr = mid; + do { + for (int x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) - + PREP_BIAS; + + mid_ptr += 128; + tmp += w; + } while (--h); + } else { + do { + for (int x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(src, x, mx, 1, + 4 - intermediate_bits) - + PREP_BIAS; + + tmp += w; + src += src_stride; + } while (--h); + } + } else if (my) { + do { + for (int x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride, + 4 - intermediate_bits) - PREP_BIAS; + + tmp += w; + src += src_stride; + } while (--h); + } 
else + prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); +} + +static void prep_bilin_scaled_c(int16_t *tmp, + const pixel *src, ptrdiff_t src_stride, + const int w, int h, const int mx, int my, + const int dx, const int dy HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + int tmp_h = (((h - 1) * dy + my) >> 10) + 2; + int16_t mid[128 * (256 + 1)], *mid_ptr = mid; + + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, + 4 - intermediate_bits); + imx += dx; + ioff += imx >> 10; + imx &= 0x3ff; + } + + mid_ptr += 128; + src += PXSTRIDE(src_stride); + } while (--tmp_h); + + mid_ptr = mid; + do { + int x; + + for (x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS; + + my += dy; + mid_ptr += (my >> 10) * 128; + my &= 0x3ff; + tmp += w; + } while (--h); +} + +static void avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int sh = intermediate_bits + 1; + const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh); + + tmp1 += w; + tmp2 += w; + dst += PXSTRIDE(dst_stride); + } while (--h); +} + +static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + const int weight HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int sh = intermediate_bits + 4; + const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] * weight + + tmp2[x] * (16 - weight) + rnd) >> sh); + + tmp1 += w; + tmp2 += w; + dst += PXSTRIDE(dst_stride); + } while (--h); +} + +static void mask_c(pixel *dst, const 
ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + const uint8_t *mask HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int sh = intermediate_bits + 6; + const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] * mask[x] + + tmp2[x] * (64 - mask[x]) + rnd) >> sh); + + tmp1 += w; + tmp2 += w; + mask += w; + dst += PXSTRIDE(dst_stride); + } while (--h); +} + +#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) +static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, + const int w, int h, const uint8_t *mask) +{ + do { + for (int x = 0; x < w; x++) { + dst[x] = blend_px(dst[x], tmp[x], mask[x]); + } + dst += PXSTRIDE(dst_stride); + tmp += w; + mask += w; + } while (--h); +} + +static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, + const int w, int h) +{ + const uint8_t *const mask = &dav1d_obmc_masks[w]; + do { + for (int x = 0; x < (w * 3) >> 2; x++) { + dst[x] = blend_px(dst[x], tmp[x], mask[x]); + } + dst += PXSTRIDE(dst_stride); + tmp += w; + } while (--h); +} + +static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, + const int w, int h) +{ + const uint8_t *mask = &dav1d_obmc_masks[h]; + h = (h * 3) >> 2; + do { + const int m = *mask++; + for (int x = 0; x < w; x++) { + dst[x] = blend_px(dst[x], tmp[x], m); + } + dst += PXSTRIDE(dst_stride); + tmp += w; + } while (--h); +} + +static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + uint8_t *mask, const int sign, + const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) +{ + // store mask at 2x2 resolution, i.e. 
store 2x1 sum for even rows, + // and then load this intermediate to calculate final value for odd rows + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int bitdepth = bitdepth_from_max(bitdepth_max); + const int sh = intermediate_bits + 6; + const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; + const int mask_sh = bitdepth + intermediate_bits - 4; + const int mask_rnd = 1 << (mask_sh - 5); + do { + for (int x = 0; x < w; x++) { + const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64); + dst[x] = iclip_pixel((tmp1[x] * m + + tmp2[x] * (64 - m) + rnd) >> sh); + + if (ss_hor) { + x++; + + const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64); + dst[x] = iclip_pixel((tmp1[x] * n + + tmp2[x] * (64 - n) + rnd) >> sh); + + if (h & ss_ver) { + mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2; + } else if (ss_ver) { + mask[x >> 1] = m + n; + } else { + mask[x >> 1] = (m + n + 1 - sign) >> 1; + } + } else { + mask[x] = m; + } + } + + tmp1 += w; + tmp2 += w; + dst += PXSTRIDE(dst_stride); + if (!ss_ver || (h & 1)) mask += w >> ss_hor; + } while (--h); +} + +#define w_mask_fns(ssn, ss_hor, ss_ver) \ +static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \ + const int16_t *const tmp1, const int16_t *const tmp2, \ + const int w, const int h, uint8_t *mask, \ + const int sign HIGHBD_DECL_SUFFIX) \ +{ \ + w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \ + HIGHBD_TAIL_SUFFIX); \ +} + +w_mask_fns(444, 0, 0); +w_mask_fns(422, 1, 0); +w_mask_fns(420, 1, 1); + +#undef w_mask_fns + +#if ARCH_X86 +#define FILTER_WARP(src, x, F, stride) \ + (F[0] * src[x + -3 * stride] + \ + F[4] * src[x + -2 * stride] + \ + F[1] * src[x + -1 * stride] + \ + F[5] * src[x + +0 * stride] + \ + F[2] * src[x + +1 * stride] + \ + F[6] * src[x + +2 * stride] + \ + F[3] * src[x + +3 * stride] + \ + F[7] * src[x + +4 * stride]) +#else +#define FILTER_WARP(src, x, F, stride) \ + 
(F[0] * src[x + -3 * stride] + \ + F[1] * src[x + -2 * stride] + \ + F[2] * src[x + -1 * stride] + \ + F[3] * src[x + +0 * stride] + \ + F[4] * src[x + +1 * stride] + \ + F[5] * src[x + +2 * stride] + \ + F[6] * src[x + +3 * stride] + \ + F[7] * src[x + +4 * stride]) +#endif + +#define FILTER_WARP_RND(src, x, F, stride, sh) \ + ((FILTER_WARP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) + +#define FILTER_WARP_CLIP(src, x, F, stride, sh) \ + iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh)) + +static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *const abcd, int mx, int my + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + int16_t mid[15 * 8], *mid_ptr = mid; + + src -= 3 * PXSTRIDE(src_stride); + for (int y = 0; y < 15; y++, mx += abcd[1]) { + for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { + const int8_t *const filter = + dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]; + + mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, + 7 - intermediate_bits); + } + src += PXSTRIDE(src_stride); + mid_ptr += 8; + } + + mid_ptr = &mid[3 * 8]; + for (int y = 0; y < 8; y++, my += abcd[3]) { + for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { + const int8_t *const filter = + dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]; + + dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8, + 7 + intermediate_bits); + } + mid_ptr += 8; + dst += PXSTRIDE(dst_stride); + } +} + +static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *const abcd, int mx, int my + HIGHBD_DECL_SUFFIX) +{ + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + int16_t mid[15 * 8], *mid_ptr = mid; + + src -= 3 * PXSTRIDE(src_stride); + for (int y = 0; y < 15; y++, mx += abcd[1]) { + for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { + const int8_t *const filter = + 
dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]; + + mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, + 7 - intermediate_bits); + } + src += PXSTRIDE(src_stride); + mid_ptr += 8; + } + + mid_ptr = &mid[3 * 8]; + for (int y = 0; y < 8; y++, my += abcd[3]) { + for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { + const int8_t *const filter = + dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]; + + tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS; + } + mid_ptr += 8; + tmp += tmp_stride; + } +} + +static void emu_edge_c(const intptr_t bw, const intptr_t bh, + const intptr_t iw, const intptr_t ih, + const intptr_t x, const intptr_t y, + pixel *dst, const ptrdiff_t dst_stride, + const pixel *ref, const ptrdiff_t ref_stride) +{ + // find offset in reference of visible block to copy + ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) + + iclip((int) x, 0, (int) iw - 1); + + // number of pixels to extend (left, right, top, bottom) + const int left_ext = iclip((int) -x, 0, (int) bw - 1); + const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1); + assert(left_ext + right_ext < bw); + const int top_ext = iclip((int) -y, 0, (int) bh - 1); + const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1); + assert(top_ext + bottom_ext < bh); + + // copy visible portion first + pixel *blk = dst + top_ext * PXSTRIDE(dst_stride); + const int center_w = (int) (bw - left_ext - right_ext); + const int center_h = (int) (bh - top_ext - bottom_ext); + for (int y = 0; y < center_h; y++) { + pixel_copy(blk + left_ext, ref, center_w); + // extend left edge for this line + if (left_ext) + pixel_set(blk, blk[left_ext], left_ext); + // extend right edge for this line + if (right_ext) + pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1], + right_ext); + ref += PXSTRIDE(ref_stride); + blk += PXSTRIDE(dst_stride); + } + + // copy top + blk = dst + top_ext * PXSTRIDE(dst_stride); + for (int y = 0; y < top_ext; y++) { + pixel_copy(dst, blk, 
bw); + dst += PXSTRIDE(dst_stride); + } + + // copy bottom + dst += center_h * PXSTRIDE(dst_stride); + for (int y = 0; y < bottom_ext; y++) { + pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw); + dst += PXSTRIDE(dst_stride); + } +} + +static void resize_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int dst_w, int h, const int src_w, + const int dx, const int mx0 HIGHBD_DECL_SUFFIX) +{ + do { + int mx = mx0, src_x = -1; + for (int x = 0; x < dst_w; x++) { + const int8_t *const F = dav1d_resize_filter[mx >> 8]; + dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] + + F[1] * src[iclip(src_x - 2, 0, src_w - 1)] + + F[2] * src[iclip(src_x - 1, 0, src_w - 1)] + + F[3] * src[iclip(src_x + 0, 0, src_w - 1)] + + F[4] * src[iclip(src_x + 1, 0, src_w - 1)] + + F[5] * src[iclip(src_x + 2, 0, src_w - 1)] + + F[6] * src[iclip(src_x + 3, 0, src_w - 1)] + + F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) + + 64) >> 7); + mx += dx; + src_x += mx >> 14; + mx &= 0x3fff; + } + + dst += PXSTRIDE(dst_stride); + src += PXSTRIDE(src_stride); + } while (--h); +} + +COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { +#define init_mc_fns(type, name) do { \ + c->mc [type] = put_##name##_c; \ + c->mc_scaled [type] = put_##name##_scaled_c; \ + c->mct [type] = prep_##name##_c; \ + c->mct_scaled[type] = prep_##name##_scaled_c; \ +} while (0) + + init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular); + init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth); + init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp); + init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular); + init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth); + init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp); + init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular); + init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth); + init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp); + init_mc_fns(FILTER_2D_BILINEAR, bilin); + 
+ c->avg = avg_c; + c->w_avg = w_avg_c; + c->mask = mask_c; + c->blend = blend_c; + c->blend_v = blend_v_c; + c->blend_h = blend_h_c; + c->w_mask[0] = w_mask_444_c; + c->w_mask[1] = w_mask_422_c; + c->w_mask[2] = w_mask_420_c; + c->warp8x8 = warp_affine_8x8_c; + c->warp8x8t = warp_affine_8x8t_c; + c->emu_edge = emu_edge_c; + c->resize = resize_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_mc_dsp_init_arm)(c); +#elif ARCH_X86 + bitfn(dav1d_mc_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/mem.c b/third_party/dav1d/src/mem.c new file mode 100644 index 0000000000..8728247636 --- /dev/null +++ b/third_party/dav1d/src/mem.c @@ -0,0 +1,74 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2020, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include <stdint.h> + +#include "src/mem.h" +#include "src/thread.h" + +void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) { /* return buf to the pool: it becomes the new list head, linked in under pool->lock */ + pthread_mutex_lock(&pool->lock); + buf->next = pool->buf; + pool->buf = buf; + pthread_mutex_unlock(&pool->lock); +} + +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) { /* reuse the list head when its payload is exactly size bytes, otherwise allocate; returns NULL if allocation fails */ + pthread_mutex_lock(&pool->lock); + Dav1dMemPoolBuffer *buf = pool->buf; + uint8_t *data; + if (buf) { + pool->buf = buf->next; + pthread_mutex_unlock(&pool->lock); + data = buf->data; + if ((uintptr_t)buf - (uintptr_t)data != size) { /* buffers allocated below keep their header at data + size, so this distance is the payload size; on mismatch drop the buffer and allocate a fresh one */ + dav1d_free_aligned(data); + goto alloc; + } + } else { + pthread_mutex_unlock(&pool->lock); +alloc: + data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64); + if (!data) return NULL; + buf = (Dav1dMemPoolBuffer*)(data + size); /* header lives directly after the size-byte payload */ + buf->data = data; + } + + return buf; +} + +COLD void dav1d_mem_pool_destroy(Dav1dMemPool *const pool) { /* destroys the mutex, then frees the allocation behind every buffer still on the list */ + pthread_mutex_destroy(&pool->lock); + Dav1dMemPoolBuffer *buf = pool->buf; + while (buf) { + void *const data = buf->data; + buf = buf->next; + dav1d_free_aligned(data); + } +} diff --git a/third_party/dav1d/src/mem.h b/third_party/dav1d/src/mem.h new file mode 100644 index 0000000000..a4d1971daf --- /dev/null +++ b/third_party/dav1d/src/mem.h @@ -0,0 +1,100 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_MEM_H +#define DAV1D_SRC_MEM_H + +#include <stdlib.h> + +#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) +#include <malloc.h> +#endif + +#include "common/attributes.h" + +#include "src/thread.h" + +typedef struct Dav1dMemPoolBuffer { /* node of the pool's singly linked buffer list */ + void *data; /* pointer handed to dav1d_free_aligned() when the buffer is released */ + struct Dav1dMemPoolBuffer *next; +} Dav1dMemPoolBuffer; + +typedef struct Dav1dMemPool { /* list of buffers guarded by a mutex */ + pthread_mutex_t lock; + Dav1dMemPoolBuffer *buf; /* list head, NULL when the pool is empty */ +} Dav1dMemPool; + +void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); +void dav1d_mem_pool_destroy(Dav1dMemPool *pool); + +/* + * Allocate align-byte aligned memory.
The return value can be released + * by calling the dav1d_free_aligned() function. + */ +static inline void *dav1d_alloc_aligned(size_t sz, size_t align) { + assert(!(align & (align - 1))); /* align may have at most one bit set */ +#ifdef HAVE_POSIX_MEMALIGN + void *ptr; + if (posix_memalign(&ptr, align, sz)) return NULL; /* a non-zero result is reported to the caller as NULL */ + return ptr; +#elif defined(HAVE_ALIGNED_MALLOC) + return _aligned_malloc(sz, align); +#elif defined(HAVE_MEMALIGN) + return memalign(align, sz); +#else +#error Missing aligned alloc implementation +#endif +} + +static inline void dav1d_free_aligned(void* ptr) { /* releases ptr with the deallocator matching the allocator selected in dav1d_alloc_aligned() */ +#ifdef HAVE_POSIX_MEMALIGN + free(ptr); +#elif defined(HAVE_ALIGNED_MALLOC) + _aligned_free(ptr); +#elif defined(HAVE_MEMALIGN) + free(ptr); +#endif +} + +static inline void dav1d_freep_aligned(void* ptr) { /* ptr is the address of a pointer: if that pointer is non-NULL, free it via dav1d_free_aligned() and reset it to NULL */ + void **mem = (void **) ptr; + if (*mem) { + dav1d_free_aligned(*mem); + *mem = NULL; + } +} + +static inline void freep(void *ptr) { /* same contract as dav1d_freep_aligned(), but the target is released with free() */ + void **mem = (void **) ptr; + if (*mem) { + free(*mem); + *mem = NULL; + } +} + +#endif /* DAV1D_SRC_MEM_H */ diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build new file mode 100644 index 0000000000..328ea9cd3b --- /dev/null +++ b/third_party/dav1d/src/meson.build @@ -0,0 +1,337 @@ +# Copyright © 2018-2019, VideoLAN and dav1d authors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# Build definition for the dav1d library +# + +# libdav1d source files +libdav1d_sources = files( + 'cdf.c', + 'cpu.c', + 'data.c', + 'decode.c', + 'dequant_tables.c', + 'getbits.c', + 'intra_edge.c', + 'itx_1d.c', + 'lf_mask.c', + 'log.c', + 'mem.c', + 'msac.c', + 'obu.c', + 'picture.c', + 'qm.c', + 'ref.c', + 'refmvs.c', + 'scan.c', + 'tables.c', + 'warpmv.c', + 'wedge.c', +) + +# libdav1d bitdepth source files +# These files are compiled for each bitdepth with +# `BITDEPTH` defined to the currently built bitdepth. +libdav1d_tmpl_sources = files( + 'cdef_apply_tmpl.c', + 'cdef_tmpl.c', + 'fg_apply_tmpl.c', + 'film_grain_tmpl.c', + 'ipred_prepare_tmpl.c', + 'ipred_tmpl.c', + 'itx_tmpl.c', + 'lf_apply_tmpl.c', + 'loopfilter_tmpl.c', + 'looprestoration_tmpl.c', + 'lr_apply_tmpl.c', + 'mc_tmpl.c', + 'recon_tmpl.c', +) + +libdav1d_arch_tmpl_sources = [] + +libdav1d_bitdepth_objs = [] + +# libdav1d entrypoint source files +# These source files contain library entry points and are +# built with the stack-realign flag set, where necessary. 
+libdav1d_entrypoints_sources = files( + 'lib.c', + 'thread_task.c' +) + +# ASM specific sources +libdav1d_asm_objs = [] +# Arch-specific flags +arch_flags = [] +if is_asm_enabled + if (host_machine.cpu_family() == 'aarch64' or + host_machine.cpu_family().startswith('arm')) + + libdav1d_sources += files( + 'arm/cpu.c', + ) + libdav1d_tmpl_sources += files( + 'arm/cdef_init_tmpl.c', + 'arm/ipred_init_tmpl.c', + 'arm/itx_init_tmpl.c', + 'arm/loopfilter_init_tmpl.c', + 'arm/looprestoration_init_tmpl.c', + 'arm/mc_init_tmpl.c', + ) + if (host_machine.cpu_family() == 'aarch64' or + host_machine.cpu() == 'arm64') + libdav1d_sources_asm = files( + # itx.S is used for both 8 and 16 bpc. + 'arm/64/itx.S', + 'arm/64/looprestoration_common.S', + 'arm/64/msac.S', + ) + + if dav1d_bitdepths.contains('8') + libdav1d_sources_asm += files( + 'arm/64/cdef.S', + 'arm/64/ipred.S', + 'arm/64/loopfilter.S', + 'arm/64/looprestoration.S', + 'arm/64/mc.S', + ) + endif + + if dav1d_bitdepths.contains('16') + libdav1d_sources_asm += files( + 'arm/64/cdef16.S', + 'arm/64/ipred16.S', + 'arm/64/itx16.S', + 'arm/64/loopfilter16.S', + 'arm/64/looprestoration16.S', + 'arm/64/mc16.S', + ) + endif + elif host_machine.cpu_family().startswith('arm') + libdav1d_sources_asm = files( + 'arm/32/msac.S', + ) + + if dav1d_bitdepths.contains('8') + libdav1d_sources_asm += files( + 'arm/32/cdef.S', + 'arm/32/ipred.S', + 'arm/32/itx.S', + 'arm/32/loopfilter.S', + 'arm/32/looprestoration.S', + 'arm/32/mc.S', + ) + endif + + if dav1d_bitdepths.contains('16') + libdav1d_sources_asm += files( + 'arm/32/cdef16.S', + 'arm/32/looprestoration16.S', + 'arm/32/mc16.S', + ) + endif + endif + + if use_gaspp + libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm) + else + libdav1d_sources += libdav1d_sources_asm + endif + elif host_machine.cpu_family().startswith('x86') + + libdav1d_sources += files( + 'x86/cpu.c', + 'x86/msac_init.c', + ) + + libdav1d_tmpl_sources += files( + 'x86/cdef_init_tmpl.c', + 
'x86/film_grain_init_tmpl.c', + 'x86/ipred_init_tmpl.c', + 'x86/itx_init_tmpl.c', + 'x86/loopfilter_init_tmpl.c', + 'x86/looprestoration_init_tmpl.c', + 'x86/mc_init_tmpl.c', + ) + + # NASM source files + libdav1d_sources_asm = files( + 'x86/cpuid.asm', + 'x86/msac.asm', + ) + + if dav1d_bitdepths.contains('8') + libdav1d_sources_asm += files( + 'x86/cdef_avx512.asm', + 'x86/mc_avx512.asm', + 'x86/cdef_avx2.asm', + 'x86/mc_avx2.asm', + 'x86/film_grain.asm', + 'x86/ipred.asm', + 'x86/itx.asm', + 'x86/loopfilter.asm', + 'x86/looprestoration.asm', + 'x86/cdef_sse.asm', + 'x86/film_grain_ssse3.asm', + 'x86/ipred_ssse3.asm', + 'x86/itx_ssse3.asm', + 'x86/loopfilter_ssse3.asm', + 'x86/looprestoration_ssse3.asm', + 'x86/mc_sse.asm', + ) + endif + + if dav1d_bitdepths.contains('16') + libdav1d_sources_asm += files( + ) + endif + + # Compile the ASM sources with NASM + libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm) + elif host_machine.cpu() == 'ppc64le' + arch_flags = ['-maltivec', '-mvsx'] + libdav1d_sources += files( + 'ppc/cpu.c', + ) + libdav1d_arch_tmpl_sources += files( + 'ppc/cdef_init_tmpl.c', + 'ppc/looprestoration_init_tmpl.c', + ) + endif +endif + + + +api_export_flags = [] + +# +# Windows .rc file and API export flags +# + +if host_machine.system() == 'windows' and get_option('default_library') != 'static' + rc_file = configure_file( + input : 'dav1d.rc.in', + output : 'dav1d.rc', + configuration : rc_data + ) + + libdav1d_rc_obj = winmod.compile_resources(rc_file) + + api_export_flags = ['-DDAV1D_BUILDING_DLL'] +else + libdav1d_rc_obj = [] +endif + + + + +# +# Library definitions +# + +# Helper library for dav1d entrypoints +libdav1d_entrypoints_objs = static_library('dav1d_entrypoint', + libdav1d_entrypoints_sources, + rev_target, config_h_target, + + include_directories : dav1d_inc_dirs, + dependencies: [stdatomic_dependency], + c_args : [stackalign_flag, stackrealign_flag, api_export_flags], + install : false, + build_by_default : false, 
+).extract_all_objects() + +# Helper library for each bitdepth +libdav1d_bitdepth_objs = [] +foreach bitdepth : dav1d_bitdepths + libdav1d_bitdepth_objs += static_library( + 'dav1d_bitdepth_@0@'.format(bitdepth), + libdav1d_tmpl_sources, config_h_target, + include_directories: dav1d_inc_dirs, + dependencies : [stdatomic_dependency], + c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag, + install : false, + build_by_default : false, + ).extract_all_objects() +endforeach + +# Helper library for each bitdepth and architecture-specific flags +foreach bitdepth : dav1d_bitdepths + libdav1d_bitdepth_objs += static_library( + 'dav1d_arch_bitdepth_@0@'.format(bitdepth), + libdav1d_arch_tmpl_sources, config_h_target, + include_directories: dav1d_inc_dirs, + dependencies : [stdatomic_dependency], + c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags, + install : false, + build_by_default : false, + ).extract_all_objects() +endforeach + +# The final dav1d library +if host_machine.system() == 'windows' + dav1d_soversion = '' +else + dav1d_soversion = dav1d_api_version_major +endif + +libdav1d = library('dav1d', + libdav1d_sources, + libdav1d_asm_objs, + libdav1d_rc_obj, + + objects : [ + libdav1d_bitdepth_objs, + libdav1d_entrypoints_objs + ], + + include_directories : dav1d_inc_dirs, + dependencies : [ + stdatomic_dependency, + thread_dependency, + thread_compat_dep, + libdl_dependency, + ], + c_args : [stackalign_flag, api_export_flags], + version : dav1d_soname_version, + soversion : dav1d_soversion, + install : true, +) + +dav1d_dep = declare_dependency(link_with: libdav1d, + include_directories : include_directories('../include/dav1d') +) + +# +# Generate pkg-config .pc file +# +pkg_mod = import('pkgconfig') +pkg_mod.generate(libraries: libdav1d, + version: meson.project_version(), + name: 'libdav1d', + filebase: 'dav1d', + description: 'AV1 decoding library' +) diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c new 
file mode 100644 index 0000000000..8195977d57 --- /dev/null +++ b/third_party/dav1d/src/msac.c @@ -0,0 +1,208 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <limits.h> + +#include "common/intops.h" + +#include "src/msac.h" + +#define EC_PROB_SHIFT 6 +#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16 + +#define EC_WIN_SIZE (sizeof(ec_win) << 3) + +static inline void ctx_refill(MsacContext *const s) { + const uint8_t *buf_pos = s->buf_pos; + const uint8_t *buf_end = s->buf_end; + int c = EC_WIN_SIZE - s->cnt - 24; + ec_win dif = s->dif; + while (c >= 0 && buf_pos < buf_end) { + dif ^= ((ec_win)*buf_pos++) << c; + c -= 8; + } + s->dif = dif; + s->cnt = EC_WIN_SIZE - c - 24; + s->buf_pos = buf_pos; +} + +/* Takes updated dif and range values, renormalizes them so that + * 32768 <= rng < 65536 (reading more bytes from the stream into dif if + * necessary), and stores them back in the decoder context. + * dif: The new value of dif. + * rng: The new value of the range. */ +static inline void ctx_norm(MsacContext *const s, const ec_win dif, + const unsigned rng) +{ + const int d = 15 ^ (31 ^ clz(rng)); + assert(rng <= 65535U); + s->cnt -= d; + s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */ + s->rng = rng << d; + if (s->cnt < 0) + ctx_refill(s); +} + +unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) { + const unsigned r = s->rng; + ec_win dif = s->dif; + assert((dif >> (EC_WIN_SIZE - 16)) < r); + // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can + // replace the multiply with a simple shift. + unsigned v = ((r >> 8) << 7) + EC_MIN_PROB; + const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16); + const unsigned ret = dif >= vw; + dif -= ret * vw; + v += ret * (r - 2 * v); + ctx_norm(s, dif, v); + return !ret; +} + +/* Decode a single binary value. + * f: The probability that the bit is one + * Return: The value decoded (0 or 1).
*/ +unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) { + const unsigned r = s->rng; + ec_win dif = s->dif; + assert((dif >> (EC_WIN_SIZE - 16)) < r); + unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; + const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16); + const unsigned ret = dif >= vw; + dif -= ret * vw; + v += ret * (r - 2 * v); + ctx_norm(s, dif, v); + return !ret; +} + +int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, + const int n, const unsigned k) +{ + int i = 0; + int a = 0; + int b = k; + while ((2 << b) < n) { + if (!dav1d_msac_decode_bool_equi(s)) break; + b = k + i++; + a = (1 << b); + } + const unsigned v = dav1d_msac_decode_bools(s, b) + a; + return ref * 2 <= n ? inv_recenter(ref, v) : + n - 1 - inv_recenter(n - 1 - ref, v); +} + +/* Decodes a symbol given an inverse cumulative distribution function (CDF) + * table in Q15. */ +unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s, + uint16_t *const cdf, + const size_t n_symbols) +{ + const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8; + unsigned u, v = s->rng, val = -1; + + assert(n_symbols <= 15); + assert(cdf[n_symbols] <= 32); + + do { + val++; + u = v; + v = r * (cdf[val] >> EC_PROB_SHIFT); + v >>= 7 - EC_PROB_SHIFT; + v += EC_MIN_PROB * ((unsigned)n_symbols - val); + } while (c < v); + + assert(u <= s->rng); + + ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v); + + if (s->allow_update_cdf) { + const unsigned count = cdf[n_symbols]; + const unsigned rate = 4 + (count >> 4) + (n_symbols > 2); + unsigned i; + for (i = 0; i < val; i++) + cdf[i] += (32768 - cdf[i]) >> rate; + for (; i < n_symbols; i++) + cdf[i] -= cdf[i] >> rate; + cdf[n_symbols] = count + (count < 32); + } + + return val; +} + +unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s, + uint16_t *const cdf) +{ + const unsigned bit = dav1d_msac_decode_bool(s, *cdf); + + if (s->allow_update_cdf) { + // update_cdf() 
specialized for boolean CDFs + const unsigned count = cdf[1]; + const int rate = 4 + (count >> 4); + if (bit) + cdf[0] += (32768 - cdf[0]) >> rate; + else + cdf[0] -= cdf[0] >> rate; + cdf[1] = count + (count < 32); + } + + return bit; +} + +unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) { + unsigned tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3); + unsigned tok = 3 + tok_br; + if (tok_br == 3) { + tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3); + tok = 6 + tok_br; + if (tok_br == 3) { + tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3); + tok = 9 + tok_br; + if (tok_br == 3) + tok = 12 + dav1d_msac_decode_symbol_adapt4(s, cdf, 3); + } + } + return tok; +} + +void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, + const size_t sz, const int disable_cdf_update_flag) +{ + s->buf_pos = data; + s->buf_end = data + sz; + s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1; + s->rng = 0x8000; + s->cnt = -15; + s->allow_update_cdf = !disable_cdf_update_flag; + ctx_refill(s); + +#if ARCH_X86_64 && HAVE_ASM + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; + + dav1d_msac_init_x86(s); +#endif +} diff --git a/third_party/dav1d/src/msac.h b/third_party/dav1d/src/msac.h new file mode 100644 index 0000000000..eb04f58f81 --- /dev/null +++ b/third_party/dav1d/src/msac.h @@ -0,0 +1,108 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_MSAC_H +#define DAV1D_SRC_MSAC_H + +#include <stdint.h> +#include <stdlib.h> + +#include "common/intops.h" + +typedef size_t ec_win; + +typedef struct MsacContext { + const uint8_t *buf_pos; + const uint8_t *buf_end; + ec_win dif; + unsigned rng; + int cnt; + int allow_update_cdf; + +#if ARCH_X86_64 && HAVE_ASM + unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols); +#endif +} MsacContext; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/msac.h" +#elif ARCH_X86 +#include "src/x86/msac.h" +#endif +#endif + +void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz, + int disable_cdf_update_flag); +unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s); +unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f); +unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf); +int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k); + +/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */ +#ifndef
dav1d_msac_decode_symbol_adapt4 +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c +#endif +#ifndef dav1d_msac_decode_symbol_adapt8 +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c +#endif +#ifndef dav1d_msac_decode_symbol_adapt16 +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c +#endif +#ifndef dav1d_msac_decode_bool_adapt +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c +#endif +#ifndef dav1d_msac_decode_bool_equi +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c +#endif +#ifndef dav1d_msac_decode_bool +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c +#endif +#ifndef dav1d_msac_decode_hi_tok +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c +#endif + +static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) { + unsigned v = 0; + while (n--) + v = (v << 1) | dav1d_msac_decode_bool_equi(s); + return v; +} + +static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) { + assert(n > 0); + const int l = ulog2(n) + 1; + assert(l > 1); + const unsigned m = (1 << l) - n; + const unsigned v = dav1d_msac_decode_bools(s, l - 1); + return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s); +} + +#endif /* DAV1D_SRC_MSAC_H */ diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c new file mode 100644 index 0000000000..589129fd42 --- /dev/null +++ b/third_party/dav1d/src/obu.c @@ -0,0 +1,1603 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <errno.h> +#include <limits.h> +#include <stdio.h> + +#include "dav1d/data.h" + +#include "common/intops.h" + +#include "src/decode.h" +#include "src/getbits.h" +#include "src/levels.h" +#include "src/log.h" +#include "src/obu.h" +#include "src/ref.h" +#include "src/thread_task.h" + +static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, + Dav1dSequenceHeader *const hdr) +{ +#define DEBUG_SEQ_HDR 0 + +#if DEBUG_SEQ_HDR + const unsigned init_bit_pos = dav1d_get_bits_pos(gb); +#endif + + hdr->profile = dav1d_get_bits(gb, 3); + if (hdr->profile > 2) goto error; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-profile: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + hdr->still_picture = dav1d_get_bits(gb, 1); + hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1); + if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-stillpicture_flags: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + if (hdr->reduced_still_picture_header) { + hdr->timing_info_present = 0; + hdr->decoder_model_info_present = 0; + hdr->display_model_info_present = 0; + hdr->num_operating_points = 1; + hdr->operating_points[0].idc = 0; + hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3); + hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2); + hdr->operating_points[0].tier = 0; + hdr->operating_points[0].decoder_model_param_present = 0; + hdr->operating_points[0].display_model_param_present = 0; + } else { + hdr->timing_info_present = dav1d_get_bits(gb, 1); + if (hdr->timing_info_present) { + hdr->num_units_in_tick = dav1d_get_bits(gb, 32); + hdr->time_scale = dav1d_get_bits(gb, 32); + hdr->equal_picture_interval = dav1d_get_bits(gb, 1); + if (hdr->equal_picture_interval) { + const unsigned num_ticks_per_picture = dav1d_get_vlc(gb); + if (num_ticks_per_picture == 0xFFFFFFFFU) + goto error; + hdr->num_ticks_per_picture = num_ticks_per_picture + 1; + } + +
hdr->decoder_model_info_present = dav1d_get_bits(gb, 1); + if (hdr->decoder_model_info_present) { + hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1; + hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32); + hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1; + hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1; + } + } else { + hdr->decoder_model_info_present = 0; + } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-timinginfo: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + hdr->display_model_info_present = dav1d_get_bits(gb, 1); + hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1; + for (int i = 0; i < hdr->num_operating_points; i++) { + struct Dav1dSequenceHeaderOperatingPoint *const op = + &hdr->operating_points[i]; + op->idc = dav1d_get_bits(gb, 12); + if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00))) + goto error; + op->major_level = 2 + dav1d_get_bits(gb, 3); + op->minor_level = dav1d_get_bits(gb, 2); + op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0; + op->decoder_model_param_present = + hdr->decoder_model_info_present && dav1d_get_bits(gb, 1); + if (op->decoder_model_param_present) { + struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = + &hdr->operating_parameter_info[i]; + opi->decoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->encoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->low_delay_mode = dav1d_get_bits(gb, 1); + } + op->display_model_param_present = + hdr->display_model_info_present && dav1d_get_bits(gb, 1); + if (op->display_model_param_present) { + op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; + } + } + const int op_idx = + c->operating_point < hdr->num_operating_points ? 
c->operating_point : 0; + c->operating_point_idc = hdr->operating_points[op_idx].idc; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-operating-points: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + } + + hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1; + hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1; + hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1; + hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-size: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + hdr->frame_id_numbers_present = + hdr->reduced_still_picture_header ? 0 : dav1d_get_bits(gb, 1); + if (hdr->frame_id_numbers_present) { + hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2; + hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; + } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-frame-id-numbers-present: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + hdr->sb128 = dav1d_get_bits(gb, 1); + hdr->filter_intra = dav1d_get_bits(gb, 1); + hdr->intra_edge_filter = dav1d_get_bits(gb, 1); + if (hdr->reduced_still_picture_header) { + hdr->inter_intra = 0; + hdr->masked_compound = 0; + hdr->warped_motion = 0; + hdr->dual_filter = 0; + hdr->order_hint = 0; + hdr->jnt_comp = 0; + hdr->ref_frame_mvs = 0; + hdr->order_hint_n_bits = 0; + hdr->screen_content_tools = DAV1D_ADAPTIVE; + hdr->force_integer_mv = DAV1D_ADAPTIVE; + } else { + hdr->inter_intra = dav1d_get_bits(gb, 1); + hdr->masked_compound = dav1d_get_bits(gb, 1); + hdr->warped_motion = dav1d_get_bits(gb, 1); + hdr->dual_filter = dav1d_get_bits(gb, 1); + hdr->order_hint = dav1d_get_bits(gb, 1); + if (hdr->order_hint) { + hdr->jnt_comp = dav1d_get_bits(gb, 1); + hdr->ref_frame_mvs = dav1d_get_bits(gb, 1); + } else { + hdr->jnt_comp = 0; + hdr->ref_frame_mvs = 0; + hdr->order_hint_n_bits = 0; + } + hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? 
DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1); + #if DEBUG_SEQ_HDR + printf("SEQHDR: post-screentools: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); + #endif + hdr->force_integer_mv = hdr->screen_content_tools ? + dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2; + if (hdr->order_hint) + hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1; + } + hdr->super_res = dav1d_get_bits(gb, 1); + hdr->cdef = dav1d_get_bits(gb, 1); + hdr->restoration = dav1d_get_bits(gb, 1); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-featurebits: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + hdr->hbd = dav1d_get_bits(gb, 1); + if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1); + hdr->monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0; + hdr->color_description_present = dav1d_get_bits(gb, 1); + if (hdr->color_description_present) { + hdr->pri = dav1d_get_bits(gb, 8); + hdr->trc = dav1d_get_bits(gb, 8); + hdr->mtrx = dav1d_get_bits(gb, 8); + } else { + hdr->pri = DAV1D_COLOR_PRI_UNKNOWN; + hdr->trc = DAV1D_TRC_UNKNOWN; + hdr->mtrx = DAV1D_MC_UNKNOWN; + } + if (hdr->monochrome) { + hdr->color_range = dav1d_get_bits(gb, 1); + hdr->layout = DAV1D_PIXEL_LAYOUT_I400; + hdr->ss_hor = hdr->ss_ver = 1; + hdr->chr = DAV1D_CHR_UNKNOWN; + hdr->separate_uv_delta_q = 0; + } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 && + hdr->trc == DAV1D_TRC_SRGB && + hdr->mtrx == DAV1D_MC_IDENTITY) + { + hdr->layout = DAV1D_PIXEL_LAYOUT_I444; + hdr->ss_hor = hdr->ss_ver = 0; + hdr->color_range = 1; + if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2)) + goto error; + } else { + hdr->color_range = dav1d_get_bits(gb, 1); + switch (hdr->profile) { + case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420; + hdr->ss_hor = hdr->ss_ver = 1; + break; + case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444; + hdr->ss_hor = hdr->ss_ver = 0; + break; + case 2: + if (hdr->hbd == 2) { + hdr->ss_hor = dav1d_get_bits(gb, 1); + hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1); + 
} else { + hdr->ss_hor = 1; + hdr->ss_ver = 0; + } + hdr->layout = hdr->ss_hor ? + hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 : + DAV1D_PIXEL_LAYOUT_I422 : + DAV1D_PIXEL_LAYOUT_I444; + break; + } + hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ? + dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN; + } + hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-colorinfo: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + hdr->film_grain_present = dav1d_get_bits(gb, 1); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-filmgrain: off=%u\n", + dav1d_get_bits_pos(gb) - init_bit_pos); +#endif + + dav1d_get_bits(gb, 1); // dummy bit + + // We needn't bother flushing the OBU here: we'll check we didn't + // overrun in the caller and will then discard gb, so there's no + // point in setting its position properly. + + return 0; + +error: + dav1d_log(c, "Error parsing sequence header\n"); + return DAV1D_ERR(EINVAL); +} + +static int read_frame_size(Dav1dContext *const c, GetBits *const gb, + const int use_ref) +{ + const Dav1dSequenceHeader *const seqhdr = c->seq_hdr; + Dav1dFrameHeader *const hdr = c->frame_hdr; + + if (use_ref) { + for (int i = 0; i < 7; i++) { + if (dav1d_get_bits(gb, 1)) { + const Dav1dThreadPicture *const ref = + &c->refs[c->frame_hdr->refidx[i]].p; + if (!ref->p.data[0]) return -1; + hdr->width[1] = ref->p.p.w; + hdr->height = ref->p.p.h; + hdr->render_width = ref->p.frame_hdr->render_width; + hdr->render_height = ref->p.frame_hdr->render_height; + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + if (hdr->super_res.enabled) { + const int d = hdr->super_res.width_scale_denominator = + 9 + dav1d_get_bits(gb, 3); + hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, + imin(16, hdr->width[1])); + } else { + hdr->super_res.width_scale_denominator = 8; + hdr->width[0] = hdr->width[1]; + } + return 0; + } + } + } + + if (hdr->frame_size_override) { + hdr->width[1] = 
dav1d_get_bits(gb, seqhdr->width_n_bits) + 1; + hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1; + } else { + hdr->width[1] = seqhdr->max_width; + hdr->height = seqhdr->max_height; + } + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + if (hdr->super_res.enabled) { + const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3); + hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1])); + } else { + hdr->super_res.width_scale_denominator = 8; + hdr->width[0] = hdr->width[1]; + } + hdr->have_render_size = dav1d_get_bits(gb, 1); + if (hdr->have_render_size) { + hdr->render_width = dav1d_get_bits(gb, 16) + 1; + hdr->render_height = dav1d_get_bits(gb, 16) + 1; + } else { + hdr->render_width = hdr->width[1]; + hdr->render_height = hdr->height; + } + return 0; +} + +static inline int tile_log2(const int sz, const int tgt) { + int k; + for (k = 0; (sz << k) < tgt; k++) ; + return k; +} + +static const Dav1dLoopfilterModeRefDeltas default_mode_ref_deltas = { + .mode_delta = { 0, 0 }, + .ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 }, +}; + +static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { +#define DEBUG_FRAME_HDR 0 + +#if DEBUG_FRAME_HDR + const uint8_t *const init_ptr = gb->ptr; +#endif + const Dav1dSequenceHeader *const seqhdr = c->seq_hdr; + Dav1dFrameHeader *const hdr = c->frame_hdr; + + hdr->show_existing_frame = + !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-show_existing_frame: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + if (hdr->show_existing_frame) { + hdr->existing_frame_idx = dav1d_get_bits(gb, 3); + if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval) + hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length); + if (seqhdr->frame_id_numbers_present) { + hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); + 
Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr; + if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL); + } + return 0; + } + + hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2); + hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); + if (hdr->show_frame) { + if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval) + hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length); + } else + hdr->showable_frame = dav1d_get_bits(gb, 1); + hdr->error_resilient_mode = + (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) || + hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH || + seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-frametype_bits: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->disable_cdf_update = dav1d_get_bits(gb, 1); + hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ? + dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools; + if (hdr->allow_screen_content_tools) + hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ? + dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv; + else + hdr->force_integer_mv = 0; + + if (!(hdr->frame_type & 1)) + hdr->force_integer_mv = 1; + + if (seqhdr->frame_id_numbers_present) + hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); + + hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 : + hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-frame_size_override_flag: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->frame_offset = seqhdr->order_hint ? + dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0; + hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ? 
+ dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE; + + if (seqhdr->decoder_model_info_present) { + hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1); + if (hdr->buffer_removal_time_present) { + for (int i = 0; i < c->seq_hdr->num_operating_points; i++) { + const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i]; + struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i]; + if (seqop->decoder_model_param_present) { + int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1; + int in_spatial_layer = (seqop->idc >> (hdr->spatial_id + 8)) & 1; + if (!seqop->idc || (in_temporal_layer && in_spatial_layer)) + op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length); + } + } + } + } + + if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY || + hdr->frame_type == DAV1D_FRAME_TYPE_INTRA) + { + hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && + hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8); + if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint) + for (int i = 0; i < 8; i++) + dav1d_get_bits(gb, seqhdr->order_hint_n_bits); + if (read_frame_size(c, gb, 0) < 0) goto error; + hdr->allow_intrabc = hdr->allow_screen_content_tools && + !hdr->super_res.enabled && dav1d_get_bits(gb, 1); + hdr->use_ref_frame_mvs = 0; + } else { + hdr->allow_intrabc = 0; + hdr->refresh_frame_flags = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 
0xff : + dav1d_get_bits(gb, 8); + if (hdr->error_resilient_mode && seqhdr->order_hint) + for (int i = 0; i < 8; i++) + dav1d_get_bits(gb, seqhdr->order_hint_n_bits); + hdr->frame_ref_short_signaling = + seqhdr->order_hint && dav1d_get_bits(gb, 1); + if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8 + hdr->refidx[0] = dav1d_get_bits(gb, 3); + hdr->refidx[1] = hdr->refidx[2] = -1; + hdr->refidx[3] = dav1d_get_bits(gb, 3); + hdr->refidx[4] = hdr->refidx[5] = hdr->refidx[6] = -1; + + int shifted_frame_offset[8]; + const int current_frame_offset = 1 << (seqhdr->order_hint_n_bits - 1); + for (int i = 0; i < 8; i++) { + if (!c->refs[i].p.p.frame_hdr) goto error; + shifted_frame_offset[i] = current_frame_offset + + get_poc_diff(seqhdr->order_hint_n_bits, + c->refs[i].p.p.frame_hdr->frame_offset, + hdr->frame_offset); + } + + int used_frame[8] = { 0 }; + used_frame[hdr->refidx[0]] = 1; + used_frame[hdr->refidx[3]] = 1; + + int latest_frame_offset = -1; + for (int i = 0; i < 8; i++) { + const int hint = shifted_frame_offset[i]; + if (!used_frame[i] && hint >= current_frame_offset && + hint >= latest_frame_offset) + { + hdr->refidx[6] = i; + latest_frame_offset = hint; + } + } + if (latest_frame_offset != -1) + used_frame[hdr->refidx[6]] = 1; + + int earliest_frame_offset = INT_MAX; + for (int i = 0; i < 8; i++) { + const int hint = shifted_frame_offset[i]; + if (!used_frame[i] && hint >= current_frame_offset && + hint < earliest_frame_offset) + { + hdr->refidx[4] = i; + earliest_frame_offset = hint; + } + } + if (earliest_frame_offset != INT_MAX) + used_frame[hdr->refidx[4]] = 1; + + earliest_frame_offset = INT_MAX; + for (int i = 0; i < 8; i++) { + const int hint = shifted_frame_offset[i]; + if (!used_frame[i] && hint >= current_frame_offset && + (hint < earliest_frame_offset)) + { + hdr->refidx[5] = i; + earliest_frame_offset = hint; + } + } + if (earliest_frame_offset != INT_MAX) + used_frame[hdr->refidx[5]] = 1; + + for (int i = 1; i < 
7; i++) { + if (hdr->refidx[i] < 0) { + latest_frame_offset = -1; + for (int j = 0; j < 8; j++) { + const int hint = shifted_frame_offset[j]; + if (!used_frame[j] && hint < current_frame_offset && + hint >= latest_frame_offset) + { + hdr->refidx[i] = j; + latest_frame_offset = hint; + } + } + if (latest_frame_offset != -1) + used_frame[hdr->refidx[i]] = 1; + } + } + + earliest_frame_offset = INT_MAX; + int ref = -1; + for (int i = 0; i < 8; i++) { + const int hint = shifted_frame_offset[i]; + if (hint < earliest_frame_offset) { + ref = i; + earliest_frame_offset = hint; + } + } + for (int i = 0; i < 7; i++) { + if (hdr->refidx[i] < 0) + hdr->refidx[i] = ref; + } + } + for (int i = 0; i < 7; i++) { + if (!hdr->frame_ref_short_signaling) + hdr->refidx[i] = dav1d_get_bits(gb, 3); + if (seqhdr->frame_id_numbers_present) { + const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits); + const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1); + Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr; + if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error; + } + } + const int use_ref = !hdr->error_resilient_mode && + hdr->frame_size_override; + if (read_frame_size(c, gb, use_ref) < 0) goto error; + hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1); + hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? 
DAV1D_FILTER_SWITCHABLE : + dav1d_get_bits(gb, 2); + hdr->switchable_motion_mode = dav1d_get_bits(gb, 1); + hdr->use_ref_frame_mvs = !hdr->error_resilient_mode && + seqhdr->ref_frame_mvs && seqhdr->order_hint && + hdr->frame_type & 1 && dav1d_get_bits(gb, 1); + } +#if DEBUG_FRAME_HDR + printf("HDR: post-frametype-specific-bits: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + hdr->refresh_context = !seqhdr->reduced_still_picture_header && + !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-refresh_context: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // tile data + hdr->tiling.uniform = dav1d_get_bits(gb, 1); + const int sbsz_min1 = (64 << seqhdr->sb128) - 1; + const int sbsz_log2 = 6 + seqhdr->sb128; + const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2; + const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2; + const int max_tile_width_sb = 4096 >> sbsz_log2; + const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2); + hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw); + hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS)); + hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS)); + const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh), + hdr->tiling.min_log2_cols); + if (hdr->tiling.uniform) { + for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols; + hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1); + hdr->tiling.log2_cols++) ; + const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols); + hdr->tiling.cols = 0; + for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++) + hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx; + hdr->tiling.min_log2_rows = + imax(min_log2_tiles - hdr->tiling.log2_cols, 0); + + for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows; + hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1); + hdr->tiling.log2_rows++) ; + 
const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows); + hdr->tiling.rows = 0; + for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++) + hdr->tiling.row_start_sb[hdr->tiling.rows] = sby; + } else { + hdr->tiling.cols = 0; + int widest_tile = 0, max_tile_area_sb = sbw * sbh; + for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) { + const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb); + const int tile_w = (tile_width_sb > 1) ? + 1 + dav1d_get_uniform(gb, tile_width_sb) : + 1; + hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx; + sbx += tile_w; + widest_tile = imax(widest_tile, tile_w); + } + hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols); + if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1; + const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1); + + hdr->tiling.rows = 0; + for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) { + const int tile_height_sb = imin(sbh - sby, max_tile_height_sb); + const int tile_h = (tile_height_sb > 1) ? + 1 + dav1d_get_uniform(gb, tile_height_sb) : + 1; + hdr->tiling.row_start_sb[hdr->tiling.rows] = sby; + sby += tile_h; + } + hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows); + } + hdr->tiling.col_start_sb[hdr->tiling.cols] = sbw; + hdr->tiling.row_start_sb[hdr->tiling.rows] = sbh; + if (hdr->tiling.log2_cols || hdr->tiling.log2_rows) { + hdr->tiling.update = dav1d_get_bits(gb, hdr->tiling.log2_cols + + hdr->tiling.log2_rows); + if (hdr->tiling.update >= hdr->tiling.cols * hdr->tiling.rows) + goto error; + hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1; + } else { + hdr->tiling.n_bytes = hdr->tiling.update = 0; + } +#if DEBUG_FRAME_HDR + printf("HDR: post-tiling: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // quant data + hdr->quant.yac = dav1d_get_bits(gb, 8); + hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? 
dav1d_get_sbits(gb, 6) : 0; + if (!seqhdr->monochrome) { + // If the sequence header says that delta_q might be different + // for U, V, we must check whether it actually is for this + // frame. + const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0; + hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + if (diff_uv_delta) { + hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + } else { + hdr->quant.vdc_delta = hdr->quant.udc_delta; + hdr->quant.vac_delta = hdr->quant.uac_delta; + } + } +#if DEBUG_FRAME_HDR + printf("HDR: post-quant: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->quant.qm = dav1d_get_bits(gb, 1); + if (hdr->quant.qm) { + hdr->quant.qm_y = dav1d_get_bits(gb, 4); + hdr->quant.qm_u = dav1d_get_bits(gb, 4); + hdr->quant.qm_v = + seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) : + hdr->quant.qm_u; + } +#if DEBUG_FRAME_HDR + printf("HDR: post-qm: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // segmentation data + hdr->segmentation.enabled = dav1d_get_bits(gb, 1); + if (hdr->segmentation.enabled) { + if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { + hdr->segmentation.update_map = 1; + hdr->segmentation.temporal = 0; + hdr->segmentation.update_data = 1; + } else { + hdr->segmentation.update_map = dav1d_get_bits(gb, 1); + hdr->segmentation.temporal = + hdr->segmentation.update_map ? 
dav1d_get_bits(gb, 1) : 0; + hdr->segmentation.update_data = dav1d_get_bits(gb, 1); + } + + if (hdr->segmentation.update_data) { + hdr->segmentation.seg_data.preskip = 0; + hdr->segmentation.seg_data.last_active_segid = -1; + for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) { + Dav1dSegmentationData *const seg = + &hdr->segmentation.seg_data.d[i]; + if (dav1d_get_bits(gb, 1)) { + seg->delta_q = dav1d_get_sbits(gb, 8); + hdr->segmentation.seg_data.last_active_segid = i; + } else { + seg->delta_q = 0; + } + if (dav1d_get_bits(gb, 1)) { + seg->delta_lf_y_v = dav1d_get_sbits(gb, 6); + hdr->segmentation.seg_data.last_active_segid = i; + } else { + seg->delta_lf_y_v = 0; + } + if (dav1d_get_bits(gb, 1)) { + seg->delta_lf_y_h = dav1d_get_sbits(gb, 6); + hdr->segmentation.seg_data.last_active_segid = i; + } else { + seg->delta_lf_y_h = 0; + } + if (dav1d_get_bits(gb, 1)) { + seg->delta_lf_u = dav1d_get_sbits(gb, 6); + hdr->segmentation.seg_data.last_active_segid = i; + } else { + seg->delta_lf_u = 0; + } + if (dav1d_get_bits(gb, 1)) { + seg->delta_lf_v = dav1d_get_sbits(gb, 6); + hdr->segmentation.seg_data.last_active_segid = i; + } else { + seg->delta_lf_v = 0; + } + if (dav1d_get_bits(gb, 1)) { + seg->ref = dav1d_get_bits(gb, 3); + hdr->segmentation.seg_data.last_active_segid = i; + hdr->segmentation.seg_data.preskip = 1; + } else { + seg->ref = -1; + } + if ((seg->skip = dav1d_get_bits(gb, 1))) { + hdr->segmentation.seg_data.last_active_segid = i; + hdr->segmentation.seg_data.preskip = 1; + } + if ((seg->globalmv = dav1d_get_bits(gb, 1))) { + hdr->segmentation.seg_data.last_active_segid = i; + hdr->segmentation.seg_data.preskip = 1; + } + } + } else { + // segmentation.update_data was false so we should copy + // segmentation data from the reference frame. 
+ assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE); + const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; + if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + hdr->segmentation.seg_data = + c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data; + } + } else { + memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet)); + for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) + hdr->segmentation.seg_data.d[i].ref = -1; + } +#if DEBUG_FRAME_HDR + printf("HDR: post-segmentation: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // delta q + hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0; + hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0; + hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc && + dav1d_get_bits(gb, 1); + hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0; + hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0; +#if DEBUG_FRAME_HDR + printf("HDR: post-delta_q_lf_flags: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // derive lossless flags + const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta && + !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta; + hdr->all_lossless = 1; + for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) { + hdr->segmentation.qidx[i] = hdr->segmentation.enabled ? 
+ iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) : + hdr->quant.yac; + hdr->segmentation.lossless[i] = + !hdr->segmentation.qidx[i] && delta_lossless; + hdr->all_lossless &= hdr->segmentation.lossless[i]; + } + + // loopfilter + if (hdr->all_lossless || hdr->allow_intrabc) { + hdr->loopfilter.level_y[0] = hdr->loopfilter.level_y[1] = 0; + hdr->loopfilter.level_u = hdr->loopfilter.level_v = 0; + hdr->loopfilter.sharpness = 0; + hdr->loopfilter.mode_ref_delta_enabled = 1; + hdr->loopfilter.mode_ref_delta_update = 1; + hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas; + } else { + hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6); + hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6); + if (!seqhdr->monochrome && + (hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1])) + { + hdr->loopfilter.level_u = dav1d_get_bits(gb, 6); + hdr->loopfilter.level_v = dav1d_get_bits(gb, 6); + } + hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3); + + if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { + hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas; + } else { + const int ref = hdr->refidx[hdr->primary_ref_frame]; + if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + hdr->loopfilter.mode_ref_deltas = + c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas; + } + hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1); + if (hdr->loopfilter.mode_ref_delta_enabled) { + hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1); + if (hdr->loopfilter.mode_ref_delta_update) { + for (int i = 0; i < 8; i++) + if (dav1d_get_bits(gb, 1)) + hdr->loopfilter.mode_ref_deltas.ref_delta[i] = + dav1d_get_sbits(gb, 6); + for (int i = 0; i < 2; i++) + if (dav1d_get_bits(gb, 1)) + hdr->loopfilter.mode_ref_deltas.mode_delta[i] = + dav1d_get_sbits(gb, 6); + } + } + } +#if DEBUG_FRAME_HDR + printf("HDR: post-lpf: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // cdef + if (!hdr->all_lossless && seqhdr->cdef && 
!hdr->allow_intrabc) { + hdr->cdef.damping = dav1d_get_bits(gb, 2) + 3; + hdr->cdef.n_bits = dav1d_get_bits(gb, 2); + for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) { + hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6); + if (!seqhdr->monochrome) + hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6); + } + } else { + hdr->cdef.n_bits = 0; + hdr->cdef.y_strength[0] = 0; + hdr->cdef.uv_strength[0] = 0; + } +#if DEBUG_FRAME_HDR + printf("HDR: post-cdef: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + // restoration + if ((!hdr->all_lossless || hdr->super_res.enabled) && + seqhdr->restoration && !hdr->allow_intrabc) + { + hdr->restoration.type[0] = dav1d_get_bits(gb, 2); + if (!seqhdr->monochrome) { + hdr->restoration.type[1] = dav1d_get_bits(gb, 2); + hdr->restoration.type[2] = dav1d_get_bits(gb, 2); + } else { + hdr->restoration.type[1] = + hdr->restoration.type[2] = DAV1D_RESTORATION_NONE; + } + + if (hdr->restoration.type[0] || hdr->restoration.type[1] || + hdr->restoration.type[2]) + { + // Log2 of the restoration unit size. + hdr->restoration.unit_size[0] = 6 + seqhdr->sb128; + if (dav1d_get_bits(gb, 1)) { + hdr->restoration.unit_size[0]++; + if (!seqhdr->sb128) + hdr->restoration.unit_size[0] += dav1d_get_bits(gb, 1); + } + hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0]; + if ((hdr->restoration.type[1] || hdr->restoration.type[2]) && + seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1) + { + hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1); + } + } else { + hdr->restoration.unit_size[0] = 8; + } + } else { + hdr->restoration.type[0] = DAV1D_RESTORATION_NONE; + hdr->restoration.type[1] = DAV1D_RESTORATION_NONE; + hdr->restoration.type[2] = DAV1D_RESTORATION_NONE; + } +#if DEBUG_FRAME_HDR + printf("HDR: post-restoration: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY : + dav1d_get_bits(gb, 1) ? 
DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; +#if DEBUG_FRAME_HDR + printf("HDR: post-txfmmode: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0; +#if DEBUG_FRAME_HDR + printf("HDR: post-refmode: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->skip_mode_allowed = 0; + if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) { + const unsigned poc = hdr->frame_offset; + unsigned off_before = 0xFFFFFFFFU; + int off_after = -1; + int off_before_idx, off_after_idx; + for (int i = 0; i < 7; i++) { + if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; + + const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); + if (diff > 0) { + if (off_after == -1 || get_poc_diff(seqhdr->order_hint_n_bits, + off_after, refpoc) > 0) + { + off_after = refpoc; + off_after_idx = i; + } + } else if (diff < 0 && (off_before == 0xFFFFFFFFU || + get_poc_diff(seqhdr->order_hint_n_bits, + refpoc, off_before) > 0)) + { + off_before = refpoc; + off_before_idx = i; + } + } + + if (off_before != 0xFFFFFFFFU && off_after != -1) { + hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx); + hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx); + hdr->skip_mode_allowed = 1; + } else if (off_before != 0xFFFFFFFFU) { + unsigned off_before2 = 0xFFFFFFFFU; + int off_before2_idx; + for (int i = 0; i < 7; i++) { + if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; + if (get_poc_diff(seqhdr->order_hint_n_bits, + refpoc, off_before) < 0) { + if (off_before2 == 0xFFFFFFFFU || + get_poc_diff(seqhdr->order_hint_n_bits, + refpoc, off_before2) > 0) + { + off_before2 = refpoc; + off_before2_idx = i; + } + } + } + + if (off_before2 != 0xFFFFFFFFU) { + 
hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx); + hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx); + hdr->skip_mode_allowed = 1; + } + } + } + hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0; +#if DEBUG_FRAME_HDR + printf("HDR: post-extskip: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 && + seqhdr->warped_motion && dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-warpmotionbit: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + hdr->reduced_txtp_set = dav1d_get_bits(gb, 1); +#if DEBUG_FRAME_HDR + printf("HDR: post-reducedtxtpset: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + for (int i = 0; i < 7; i++) + hdr->gmv[i] = dav1d_default_wm_params; + + if (hdr->frame_type & 1) { + for (int i = 0; i < 7; i++) { + hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY : + dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM : + dav1d_get_bits(gb, 1) ? 
DAV1D_WM_TYPE_TRANSLATION : + DAV1D_WM_TYPE_AFFINE; + + if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue; + + const Dav1dWarpedMotionParams *ref_gmv; + if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { + ref_gmv = &dav1d_default_wm_params; + } else { + const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; + if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i]; + } + int32_t *const mat = hdr->gmv[i].matrix; + const int32_t *const ref_mat = ref_gmv->matrix; + int bits, shift; + + if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) { + mat[2] = (1 << 16) + 2 * + dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12); + mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12); + + bits = 12; + shift = 10; + } else { + bits = 9 - !hdr->hp; + shift = 13 + !hdr->hp; + } + + if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) { + mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12); + mat[5] = (1 << 16) + 2 * + dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12); + } else { + mat[4] = -mat[3]; + mat[5] = mat[2]; + } + + mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift); + mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift); + } + } +#if DEBUG_FRAME_HDR + printf("HDR: post-gmv: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + hdr->film_grain.present = seqhdr->film_grain_present && + (hdr->show_frame || hdr->showable_frame) && + dav1d_get_bits(gb, 1); + if (hdr->film_grain.present) { + const unsigned seed = dav1d_get_bits(gb, 16); + hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1); + if (!hdr->film_grain.update) { + const int refidx = dav1d_get_bits(gb, 3); + int i; + for (i = 0; i < 7; i++) + if (hdr->refidx[i] == refidx) + break; + if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error; + hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data; + 
hdr->film_grain.data.seed = seed; + } else { + Dav1dFilmGrainData *const fgd = &hdr->film_grain.data; + fgd->seed = seed; + + fgd->num_y_points = dav1d_get_bits(gb, 4); + if (fgd->num_y_points > 14) goto error; + for (int i = 0; i < fgd->num_y_points; i++) { + fgd->y_points[i][0] = dav1d_get_bits(gb, 8); + if (i && fgd->y_points[i - 1][0] >= fgd->y_points[i][0]) + goto error; + fgd->y_points[i][1] = dav1d_get_bits(gb, 8); + } + + fgd->chroma_scaling_from_luma = + !seqhdr->monochrome && dav1d_get_bits(gb, 1); + if (seqhdr->monochrome || fgd->chroma_scaling_from_luma || + (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points)) + { + fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0; + } else for (int pl = 0; pl < 2; pl++) { + fgd->num_uv_points[pl] = dav1d_get_bits(gb, 4); + if (fgd->num_uv_points[pl] > 10) goto error; + for (int i = 0; i < fgd->num_uv_points[pl]; i++) { + fgd->uv_points[pl][i][0] = dav1d_get_bits(gb, 8); + if (i && fgd->uv_points[pl][i - 1][0] >= fgd->uv_points[pl][i][0]) + goto error; + fgd->uv_points[pl][i][1] = dav1d_get_bits(gb, 8); + } + } + + if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 && + !!fgd->num_uv_points[0] != !!fgd->num_uv_points[1]) + { + goto error; + } + + fgd->scaling_shift = dav1d_get_bits(gb, 2) + 8; + fgd->ar_coeff_lag = dav1d_get_bits(gb, 2); + const int num_y_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1); + if (fgd->num_y_points) + for (int i = 0; i < num_y_pos; i++) + fgd->ar_coeffs_y[i] = dav1d_get_bits(gb, 8) - 128; + for (int pl = 0; pl < 2; pl++) + if (fgd->num_uv_points[pl] || fgd->chroma_scaling_from_luma) { + const int num_uv_pos = num_y_pos + !!fgd->num_y_points; + for (int i = 0; i < num_uv_pos; i++) + fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128; + if (!fgd->num_y_points) + fgd->ar_coeffs_uv[pl][num_uv_pos] = 0; + } + fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6; + fgd->grain_scale_shift = dav1d_get_bits(gb, 2); + for (int pl = 0; pl < 2; pl++) + if (fgd->num_uv_points[pl]) { + 
fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128; + fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128; + fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256; + } + fgd->overlap_flag = dav1d_get_bits(gb, 1); + fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1); + } + } else { + memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data)); + } +#if DEBUG_FRAME_HDR + printf("HDR: post-filmgrain: off=%td\n", + (gb->ptr - init_ptr) * 8 - gb->bits_left); +#endif + + return 0; + +error: + dav1d_log(c, "Error parsing frame header\n"); + return DAV1D_ERR(EINVAL); +} + +static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { + const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows; + const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0; + + if (have_tile_pos) { + const int n_bits = c->frame_hdr->tiling.log2_cols + + c->frame_hdr->tiling.log2_rows; + c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits); + c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits); + } else { + c->tile[c->n_tile_data].start = 0; + c->tile[c->n_tile_data].end = n_tiles - 1; + } +} + +// Check that we haven't read more than obu_len bytes from the buffer +// since init_bit_pos. +static int check_for_overrun(Dav1dContext *const c, GetBits *const gb, + const unsigned init_bit_pos, + const unsigned obu_len) +{ + // Make sure we haven't actually read past the end of the gb buffer + if (gb->error) { + dav1d_log(c, "Overrun in OBU bit buffer\n"); + return 1; + } + + const unsigned pos = dav1d_get_bits_pos(gb); + + // We assume that init_bit_pos was the bit position of the buffer + // at some point in the past, so cannot be smaller than pos. 
// Check that we haven't read more than obu_len bytes from the buffer
// since init_bit_pos.
//
// Returns 1 (after logging) if the bit reader's error flag is set or if
// more than 8 * obu_len bits were consumed since init_bit_pos; returns 0
// otherwise.
static int check_for_overrun(Dav1dContext *const c, GetBits *const gb,
                             const unsigned init_bit_pos,
                             const unsigned obu_len)
{
    // Make sure we haven't actually read past the end of the gb buffer
    if (gb->error) {
        dav1d_log(c, "Overrun in OBU bit buffer\n");
        return 1;
    }

    const unsigned pos = dav1d_get_bits_pos(gb);

    // We assume that init_bit_pos was the bit position of the buffer
    // at some point in the past, so it cannot be larger than pos.
    assert (init_bit_pos <= pos);

    if (pos - init_bit_pos > 8 * obu_len) {
        dav1d_log(c, "Overrun in OBU bit buffer into next OBU\n");
        return 1;
    }

    return 0;
}

// Parse one OBU located at the start of `in` and update the decoder
// context `c` accordingly.
//
// The OBU header is read first (type, optional extension carrying the
// temporal/spatial id, optional leb128 length). The payload is then
// dispatched on the OBU type:
//   - sequence header: parsed into a fresh reference; if it differs from
//     the current one (up to operating_parameter_info) the frame header,
//     HDR metadata, reference pictures and CDFs held in `c` are dropped;
//   - frame header / redundant frame header / frame: parsed into
//     c->frame_hdr; a frame OBU then falls through to tile group parsing;
//   - tile group: appended to c->tile[], keeping a reference to `in`;
//   - metadata: content light level, mastering display and ITU-T T.35
//     payloads are stored in `c`; other metadata types are ignored.
// After the switch, a frame header with show_existing_frame is output
// immediately, and a frame whose tile groups are all present is handed
// to dav1d_submit_frame().
//
// c       decoder context.
// in      input data; in->data / in->sz describe the bytes to parse.
// global  when non-zero, frame header, frame and tile group OBUs are
//         skipped without being parsed.
//
// Returns the number of bytes consumed from `in` (OBU header plus
// payload) on success, or a DAV1D_ERR() code on failure: EINVAL for
// malformed data, ENOMEM when a reference cannot be allocated, ERANGE
// when the frame exceeds c->frame_size_limit, or whatever negative value
// parse_seq_hdr(), parse_frame_hdr() or dav1d_submit_frame() returned.
int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) {
    GetBits gb;
    int res;

    dav1d_init_get_bits(&gb, in->data, in->sz);

    // obu header
    dav1d_get_bits(&gb, 1); // obu_forbidden_bit
    const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
    const int has_extension = dav1d_get_bits(&gb, 1);
    const int has_length_field = dav1d_get_bits(&gb, 1);
    dav1d_get_bits(&gb, 1); // reserved

    int temporal_id = 0, spatial_id = 0;
    if (has_extension) {
        temporal_id = dav1d_get_bits(&gb, 3);
        spatial_id = dav1d_get_bits(&gb, 2);
        dav1d_get_bits(&gb, 3); // reserved
    }

    // obu length field; without one, the OBU is taken to span the rest of
    // the input after the 1-byte header and the optional extension byte
    const unsigned len = has_length_field ?
        dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension;
    if (gb.error) goto error;

    const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
    const unsigned init_byte_pos = init_bit_pos >> 3;

    // We must have read a whole number of bytes at this point (1 byte
    // for the header and whole bytes at a time when reading the
    // leb128 length field).
    assert((init_bit_pos & 7) == 0);

    // We also know that we haven't tried to read more than in->sz
    // bytes yet (otherwise the error flag would have been set by the
    // code in getbits.c)
    assert(in->sz >= init_byte_pos);

    // Make sure that there are enough bits left in the buffer for the
    // rest of the OBU.
    if (len > in->sz - init_byte_pos) goto error;

    // skip obu not belonging to the selected temporal/spatial layer
    // (the OBU is still reported as consumed)
    if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
        has_extension && c->operating_point_idc != 0)
    {
        const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
        const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
        if (!in_temporal_layer || !in_spatial_layer)
            return len + init_byte_pos;
    }

    switch (type) {
    case DAV1D_OBU_SEQ_HDR: {
        Dav1dRef *ref = dav1d_ref_create_using_pool(&c->seq_hdr_pool,
                                                    sizeof(Dav1dSequenceHeader));
        if (!ref) return DAV1D_ERR(ENOMEM);
        Dav1dSequenceHeader *seq_hdr = ref->data;
        memset(seq_hdr, 0, sizeof(*seq_hdr));
        if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
            dav1d_ref_dec(&ref);
            return res;
        }
        if (check_for_overrun(c, &gb, init_bit_pos, len)) {
            dav1d_ref_dec(&ref);
            return DAV1D_ERR(EINVAL);
        }
        // If we have read a sequence header which is different from
        // the old one, this is a new video sequence and can't use any
        // previous state. Free that state.
        if (!c->seq_hdr)
            c->frame_hdr = NULL;
        // see 7.5, operating_parameter_info is allowed to change in
        // sequence headers of a single sequence
        else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
            c->frame_hdr = NULL;
            c->mastering_display = NULL;
            c->content_light = NULL;
            dav1d_ref_dec(&c->mastering_display_ref);
            dav1d_ref_dec(&c->content_light_ref);
            for (int i = 0; i < 8; i++) {
                if (c->refs[i].p.p.data[0])
                    dav1d_thread_picture_unref(&c->refs[i].p);
                dav1d_ref_dec(&c->refs[i].segmap);
                dav1d_ref_dec(&c->refs[i].refmvs);
                dav1d_cdf_thread_unref(&c->cdf[i]);
            }
        }
        // the new sequence header replaces the previous one
        dav1d_ref_dec(&c->seq_hdr_ref);
        c->seq_hdr_ref = ref;
        c->seq_hdr = seq_hdr;
        break;
    }
    case DAV1D_OBU_REDUNDANT_FRAME_HDR:
        // a redundant copy is only parsed when no frame header is active
        if (c->frame_hdr) break;
        // fall-through
    case DAV1D_OBU_FRAME:
    case DAV1D_OBU_FRAME_HDR:
        if (global) break;
        if (!c->seq_hdr) goto error;
        if (!c->frame_hdr_ref) {
            c->frame_hdr_ref = dav1d_ref_create_using_pool(&c->frame_hdr_pool,
                                                           sizeof(Dav1dFrameHeader));
            if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
        }
#ifndef NDEBUG
        // ensure that the reference is writable
        assert(dav1d_ref_is_writable(c->frame_hdr_ref));
#endif
        c->frame_hdr = c->frame_hdr_ref->data;
        memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
        c->frame_hdr->temporal_id = temporal_id;
        c->frame_hdr->spatial_id = spatial_id;
        if ((res = parse_frame_hdr(c, &gb)) < 0) {
            c->frame_hdr = NULL;
            return res;
        }
        // a new frame header discards any tile groups collected so far
        for (int n = 0; n < c->n_tile_data; n++)
            dav1d_data_unref_internal(&c->tile[n].data);
        c->n_tile_data = 0;
        c->n_tiles = 0;
        if (type != DAV1D_OBU_FRAME) {
            // This is actually a frame header OBU so read the
            // trailing bit and check for overrun.
            dav1d_get_bits(&gb, 1);
            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
                c->frame_hdr = NULL;
                return DAV1D_ERR(EINVAL);
            }
        }

        // the product is computed in 64 bits before comparing to the limit
        if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] *
            c->frame_hdr->height > c->frame_size_limit)
        {
            dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1],
                      c->frame_hdr->height, c->frame_size_limit);
            c->frame_hdr = NULL;
            return DAV1D_ERR(ERANGE);
        }

        if (type != DAV1D_OBU_FRAME)
            break;
        // OBU_FRAMEs shouldn't be signaled with show_existing_frame
        if (c->frame_hdr->show_existing_frame) {
            c->frame_hdr = NULL;
            goto error;
        }

        // This is the frame header at the start of a frame OBU.
        // There's no trailing bit at the end to skip, but we do need
        // to align to the next byte.
        dav1d_bytealign_get_bits(&gb);
        // fall-through
    case DAV1D_OBU_TILE_GRP: {
        if (global) break;
        if (!c->frame_hdr) goto error;
        // grow the tile group array by one zero-initialised entry when the
        // slot at index n_tile_data has not been allocated yet
        if (c->n_tile_data_alloc < c->n_tile_data + 1) {
            if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
            struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
            if (!tile) goto error;
            c->tile = tile;
            memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
            c->n_tile_data_alloc = c->n_tile_data + 1;
        }
        parse_tile_hdr(c, &gb);
        // Align to the next byte boundary and check for overrun.
        dav1d_bytealign_get_bits(&gb);
        if (check_for_overrun(c, &gb, init_bit_pos, len))
            return DAV1D_ERR(EINVAL);
        // The current bit position is a multiple of 8 (because we
        // just aligned it) and less than 8*pkt_bytelen because
        // otherwise the overrun check would have fired.
        const unsigned pkt_bytelen = init_byte_pos + len;
        const unsigned bit_pos = dav1d_get_bits_pos(&gb);
        assert((bit_pos & 7) == 0);
        assert(pkt_bytelen >= (bit_pos >> 3));
        // the tile group references the input buffer and is narrowed to
        // the bytes that follow the tile group header within this OBU
        dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
        c->tile[c->n_tile_data].data.data += bit_pos >> 3;
        c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
        // ensure tile groups are in order and sane, see 6.10.1
        if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
            c->tile[c->n_tile_data].start != c->n_tiles)
        {
            // "<=" so that the reference just taken for the current slot
            // is released as well
            for (int i = 0; i <= c->n_tile_data; i++)
                dav1d_data_unref_internal(&c->tile[i].data);
            c->n_tile_data = 0;
            c->n_tiles = 0;
            goto error;
        }
        c->n_tiles += 1 + c->tile[c->n_tile_data].end -
                          c->tile[c->n_tile_data].start;
        c->n_tile_data++;
        break;
    }
    case DAV1D_OBU_METADATA: {
#define DEBUG_OBU_METADATA 0
#if DEBUG_OBU_METADATA
        const uint8_t *const init_ptr = gb.ptr;
#endif
        // obu metadata type field
        const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
        // number of bytes taken by the leb128-coded metadata type
        const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
        if (gb.error) goto error;

        switch (meta_type) {
        case OBU_META_HDR_CLL: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dContentLightLevel *const content_light = ref->data;

            content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
#if DEBUG_OBU_METADATA
            printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
                   content_light->max_content_light_level,
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
#if DEBUG_OBU_METADATA
            printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
                   content_light->max_frame_average_light_level,
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif

            // Skip the trailing bit, align to the next byte boundary and check for overrun.
            dav1d_get_bits(&gb, 1);
            dav1d_bytealign_get_bits(&gb);
            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
                dav1d_ref_dec(&ref);
                goto error;
            }

            // replace the previously stored content light level, if any
            dav1d_ref_dec(&c->content_light_ref);
            c->content_light = content_light;
            c->content_light_ref = ref;
            break;
        }
        case OBU_META_HDR_MDCV: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dMasteringDisplay *const mastering_display = ref->data;

            for (int i = 0; i < 3; i++) {
                mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
                mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
#if DEBUG_OBU_METADATA
                printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
                       mastering_display->primaries[i][0],
                       mastering_display->primaries[i][1],
                       (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            }
            mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
#if DEBUG_OBU_METADATA
            printf("MDCVOBU: white-point-x: %d [off=%td]\n",
                   mastering_display->white_point[0],
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
#if DEBUG_OBU_METADATA
            printf("MDCVOBU: white-point-y: %d [off=%td]\n",
                   mastering_display->white_point[1],
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
#if DEBUG_OBU_METADATA
            printf("MDCVOBU: max-luminance: %d [off=%td]\n",
                   mastering_display->max_luminance,
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
#if DEBUG_OBU_METADATA
            printf("MDCVOBU: min-luminance: %d [off=%td]\n",
                   mastering_display->min_luminance,
                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
            // Skip the trailing bit, align to the next byte boundary and check for overrun.
            dav1d_get_bits(&gb, 1);
            dav1d_bytealign_get_bits(&gb);
            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
                dav1d_ref_dec(&ref);
                goto error;
            }

            // replace the previously stored mastering display data, if any
            dav1d_ref_dec(&c->mastering_display_ref);
            c->mastering_display = mastering_display;
            c->mastering_display_ref = ref;
            break;
        }
        case OBU_META_ITUT_T35: {
            int payload_size = len;
            // Don't take into account all the trailing bits for payload_size
            while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
                payload_size--; // trailing_zero_bit x 8
            payload_size--; // trailing_one_bit + trailing_zero_bit x 7

            // Don't take into account meta_type bytes
            payload_size -= meta_type_len;

            // the country code (and its extension byte, present when the
            // code is 0xFF) are stored separately from the payload
            int country_code_extension_byte = 0;
            const int country_code = dav1d_get_bits(&gb, 8);
            payload_size--;
            if (country_code == 0xFF) {
                country_code_extension_byte = dav1d_get_bits(&gb, 8);
                payload_size--;
            }

            if (payload_size <= 0) {
                dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
                goto error;
            }

            // the payload bytes live in the same allocation, right after
            // the Dav1dITUTT35 struct
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dITUTT35 *const itut_t35_metadata = ref->data;

            // We need our public headers to be C++ compatible, so payload can't be
            // a flexible array member
            itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
            itut_t35_metadata->country_code = country_code;
            itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
            for (int i = 0; i < payload_size; i++)
                itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
            itut_t35_metadata->payload_size = payload_size;

            // replace the previously stored T.35 metadata, if any
            dav1d_ref_dec(&c->itut_t35_ref);
            c->itut_t35 = itut_t35_metadata;
            c->itut_t35_ref = ref;
            break;
        }
        case OBU_META_SCALABILITY:
        case OBU_META_TIMECODE:
            // ignore metadata OBUs we don't care about
            break;
        default:
            // print a warning but don't fail for unknown types
            dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type);
            break;
        }

        break;
    }
    case DAV1D_OBU_PADDING:
    case DAV1D_OBU_TD:
        // ignore OBUs we don't care about
        break;
    default:
        // print a warning but don't fail for unknown types
        dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len);
        break;
    }

    // With both headers available, either output an already decoded frame
    // (show_existing_frame) or submit the frame once all tiles have arrived.
    if (c->seq_hdr && c->frame_hdr) {
        if (c->frame_hdr->show_existing_frame) {
            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
            if (c->n_fc == 1) {
                // single frame context: the picture goes straight to c->out
                dav1d_picture_ref(&c->out,
                                  &c->refs[c->frame_hdr->existing_frame_idx].p.p);
                dav1d_data_props_copy(&c->out.m, &in->m);
            } else {
                // need to append this to the frame output queue
                const unsigned next = c->frame_thread.next++;
                if (c->frame_thread.next == c->n_fc)
                    c->frame_thread.next = 0;

                // wait until the frame context in this slot has no tile
                // data left before touching its delayed output picture
                Dav1dFrameContext *const f = &c->fc[next];
                pthread_mutex_lock(&f->frame_thread.td.lock);
                while (f->n_tile_data > 0)
                    pthread_cond_wait(&f->frame_thread.td.cond,
                                      &f->frame_thread.td.lock);
                Dav1dThreadPicture *const out_delayed =
                    &c->frame_thread.out_delayed[next];
                if (out_delayed->p.data[0]) {
                    // hand the picture previously queued in this slot to
                    // c->out if it is visible and did not fail decoding
                    const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
                                                                   memory_order_relaxed);
                    if (out_delayed->visible && progress != FRAME_ERROR)
                        dav1d_picture_ref(&c->out, &out_delayed->p);
                    dav1d_thread_picture_unref(out_delayed);
                }
                dav1d_thread_picture_ref(out_delayed,
                                         &c->refs[c->frame_hdr->existing_frame_idx].p);
                out_delayed->visible = 1;
                dav1d_data_props_copy(&out_delayed->p.m, &in->m);
                pthread_mutex_unlock(&f->frame_thread.td.lock);
            }
            // Showing an existing key frame: every other reference slot is
            // overwritten with the picture, CDF and segmentation map of
            // slot r, and its refmvs reference is dropped.
            if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
                const int r = c->frame_hdr->existing_frame_idx;
                for (int i = 0; i < 8; i++) {
                    if (i == r) continue;

                    if (c->refs[i].p.p.data[0])
                        dav1d_thread_picture_unref(&c->refs[i].p);
                    dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);

                    dav1d_cdf_thread_unref(&c->cdf[i]);
                    dav1d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]);

                    dav1d_ref_dec(&c->refs[i].segmap);
                    c->refs[i].segmap = c->refs[r].segmap;
                    if (c->refs[r].segmap)
                        dav1d_ref_inc(c->refs[r].segmap);
                    dav1d_ref_dec(&c->refs[i].refmvs);
                }
            }
            c->frame_hdr = NULL;
        } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
            // all tiles of the frame have been collected
            if (!c->n_tile_data)
                return DAV1D_ERR(EINVAL);
            if ((res = dav1d_submit_frame(c)) < 0)
                return res;
            assert(!c->n_tile_data);
            c->frame_hdr = NULL;
            c->n_tiles = 0;
        }
    }

    // number of input bytes consumed: OBU header plus payload
    return len + init_byte_pos;

error:
    dav1d_log(c, "Error parsing OBU data\n");
    return DAV1D_ERR(EINVAL);
}
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_OBU_H +#define DAV1D_SRC_OBU_H + +#include "dav1d/data.h" +#include "src/internal.h" + +int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global); + +#endif /* DAV1D_SRC_OBU_H */ diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c new file mode 100644 index 0000000000..739c14ca0c --- /dev/null +++ b/third_party/dav1d/src/picture.c @@ -0,0 +1,324 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include "common/intops.h" +#include "common/validate.h" + +#include "src/internal.h" +#include "src/log.h" +#include "src/picture.h" +#include "src/ref.h" +#include "src/thread.h" +#include "src/thread_task.h" + +int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) { + assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT); + const int hbd = p->p.bpc > 8; + const int aligned_w = (p->p.w + 127) & ~127; + const int aligned_h = (p->p.h + 127) & ~127; + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + ptrdiff_t y_stride = aligned_w << hbd; + ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0; + /* Due to how mapping of addresses to sets works in most L1 and L2 cache + * implementations, strides of multiples of certain power-of-two numbers + * may cause multiple rows of the same superblock to map to the same set, + * causing evictions of previous rows resulting in a reduction in cache + * hit rate. Avoid that by slightly padding the stride when necessary. 
*/ + if (!(y_stride & 1023)) + y_stride += DAV1D_PICTURE_ALIGNMENT; + if (!(uv_stride & 1023) && has_chroma) + uv_stride += DAV1D_PICTURE_ALIGNMENT; + p->stride[0] = y_stride; + p->stride[1] = uv_stride; + const size_t y_sz = y_stride * aligned_h; + const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); + const size_t pic_size = y_sz + 2 * uv_sz; + + Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size + + DAV1D_PICTURE_ALIGNMENT - + sizeof(Dav1dMemPoolBuffer)); + if (!buf) return DAV1D_ERR(ENOMEM); + p->allocator_data = buf; + + uint8_t *const data = buf->data; + p->data[0] = data; + p->data[1] = has_chroma ? data + y_sz : NULL; + p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL; + + return 0; +} + +void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) { + dav1d_mem_pool_push(cookie, p->allocator_data); +} + +struct pic_ctx_context { + Dav1dPicAllocator allocator; + Dav1dPicture pic; + void *extra_ptr; /* MUST BE AT THE END */ +}; + +static void free_buffer(const uint8_t *const data, void *const user_data) { + struct pic_ctx_context *pic_ctx = user_data; + + pic_ctx->allocator.release_picture_callback(&pic_ctx->pic, + pic_ctx->allocator.cookie); + free(pic_ctx); +} + +static int picture_alloc_with_edges(Dav1dContext *const c, + Dav1dPicture *const p, + const int w, const int h, + Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, + Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, + Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref, + Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref, + Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref, + const int bpc, + const Dav1dDataProps *const props, + Dav1dPicAllocator *const p_allocator, + const size_t extra, void **const extra_ptr) +{ + if (p->data[0]) { + dav1d_log(c, "Picture already allocated!\n"); + return -1; + } + assert(bpc > 0 && bpc <= 16); + + struct pic_ctx_context 
*pic_ctx = malloc(extra + sizeof(struct pic_ctx_context)); + if (pic_ctx == NULL) + return DAV1D_ERR(ENOMEM); + + p->p.w = w; + p->p.h = h; + p->seq_hdr = seq_hdr; + p->frame_hdr = frame_hdr; + p->content_light = content_light; + p->mastering_display = mastering_display; + p->itut_t35 = itut_t35; + p->p.layout = seq_hdr->layout; + p->p.bpc = bpc; + dav1d_data_props_set_defaults(&p->m); + const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); + if (res < 0) { + free(pic_ctx); + return res; + } + + pic_ctx->allocator = *p_allocator; + pic_ctx->pic = *p; + + if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) { + p_allocator->release_picture_callback(p, p_allocator->cookie); + free(pic_ctx); + dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno)); + return DAV1D_ERR(ENOMEM); + } + + p->seq_hdr_ref = seq_hdr_ref; + if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref); + + p->frame_hdr_ref = frame_hdr_ref; + if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref); + + dav1d_data_props_copy(&p->m, props); + + if (extra && extra_ptr) + *extra_ptr = &pic_ctx->extra_ptr; + + p->content_light_ref = content_light_ref; + if (content_light_ref) dav1d_ref_inc(content_light_ref); + + p->mastering_display_ref = mastering_display_ref; + if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref); + + p->itut_t35_ref = itut_t35_ref; + if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref); + + return 0; +} + +int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f, + const int bpc) +{ + Dav1dThreadPicture *const p = &f->sr_cur; + p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL; + + const int res = + picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, + f->seq_hdr, f->seq_hdr_ref, + f->frame_hdr, f->frame_hdr_ref, + c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, + bpc, &f->tile[0].data.m, &c->allocator, + p->t != NULL ? 
sizeof(atomic_int) * 2 : 0, + (void **) &p->progress); + if (res) return res; + + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; + + p->visible = f->frame_hdr->show_frame; + if (p->t) { + atomic_init(&p->progress[0], 0); + atomic_init(&p->progress[1], 0); + } + return res; +} + +int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w, + const Dav1dPicture *const src) +{ + struct pic_ctx_context *const pic_ctx = src->ref->user_data; + const int res = picture_alloc_with_edges(c, dst, w, src->p.h, + src->seq_hdr, src->seq_hdr_ref, + src->frame_hdr, src->frame_hdr_ref, + src->content_light, src->content_light_ref, + src->mastering_display, src->mastering_display_ref, + src->itut_t35, src->itut_t35_ref, + src->p.bpc, &src->m, &pic_ctx->allocator, + 0, NULL); + return res; +} + +void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { + validate_input(dst != NULL); + validate_input(dst->data[0] == NULL); + validate_input(src != NULL); + + if (src->ref) { + validate_input(src->data[0] != NULL); + dav1d_ref_inc(src->ref); + if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); + if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); + if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); + if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); + if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); + if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref); + } + *dst = *src; +} + +void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) { + validate_input(dst != NULL); + validate_input(dst->data[0] == NULL); + validate_input(src != NULL); + + if (src->ref) + validate_input(src->data[0] != NULL); + + *dst = *src; + memset(src, 0, sizeof(*src)); +} + +void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst, + const Dav1dThreadPicture *const src) +{ + 
dav1d_picture_ref(&dst->p, &src->p); + dst->t = src->t; + dst->visible = src->visible; + dst->progress = src->progress; +} + +void dav1d_picture_unref_internal(Dav1dPicture *const p) { + validate_input(p != NULL); + + if (p->ref) { + validate_input(p->data[0] != NULL); + dav1d_ref_dec(&p->ref); + dav1d_ref_dec(&p->seq_hdr_ref); + dav1d_ref_dec(&p->frame_hdr_ref); + dav1d_ref_dec(&p->m.user_data.ref); + dav1d_ref_dec(&p->content_light_ref); + dav1d_ref_dec(&p->mastering_display_ref); + dav1d_ref_dec(&p->itut_t35_ref); + } + memset(p, 0, sizeof(*p)); +} + +void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) { + dav1d_picture_unref_internal(&p->p); + + p->t = NULL; + p->progress = NULL; +} + +int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p, + int y_unclipped, const enum PlaneType plane_type) +{ + assert(plane_type != PLANE_TYPE_ALL); + + if (!p->t) + return 0; + + // convert to luma units; include plane delay from loopfilters; clip + const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420; + y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1 + y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter + const unsigned y = iclip(y_unclipped, 1, p->p.p.h); + atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK]; + unsigned state; + + if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y) + return state == FRAME_ERROR; + + pthread_mutex_lock(&p->t->lock); + while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y) + pthread_cond_wait(&p->t->cond, &p->t->lock); + pthread_mutex_unlock(&p->t->lock); + return state == FRAME_ERROR; +} + +void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p, + const int y, // in pixel units + const enum PlaneType plane_type) +{ + assert(plane_type != PLANE_TYPE_UV); + + if (!p->t) + return; + + pthread_mutex_lock(&p->t->lock); + if (plane_type != PLANE_TYPE_Y) + 
atomic_store(&p->progress[0], y); + if (plane_type != PLANE_TYPE_BLOCK) + atomic_store(&p->progress[1], y); + pthread_cond_broadcast(&p->t->cond); + pthread_mutex_unlock(&p->t->lock); +} diff --git a/third_party/dav1d/src/picture.h b/third_party/dav1d/src/picture.h new file mode 100644 index 0000000000..fad2536c3d --- /dev/null +++ b/third_party/dav1d/src/picture.h @@ -0,0 +1,117 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_PICTURE_H +#define DAV1D_SRC_PICTURE_H + +#include + +#include "src/thread.h" +#include "dav1d/picture.h" + +#include "src/thread_data.h" +#include "src/ref.h" + +enum PlaneType { + PLANE_TYPE_Y, + PLANE_TYPE_UV, + PLANE_TYPE_BLOCK, + PLANE_TYPE_ALL, +}; + +typedef struct Dav1dThreadPicture { + Dav1dPicture p; + int visible; + struct thread_data *t; + // [0] block data (including segmentation map and motion vectors) + // [1] pixel data + atomic_uint *progress; +} Dav1dThreadPicture; + +typedef struct Dav1dPictureBuffer { + void *data; + struct Dav1dPictureBuffer *next; +} Dav1dPictureBuffer; + +/* + * Allocate a picture with custom border size. + */ +int dav1d_thread_picture_alloc(Dav1dContext *c, Dav1dFrameContext *f, const int bpc); + +/** + * Allocate a picture with identical metadata to an existing picture. + * The width is a separate argument so this function can be used for + * super-res, where the width changes, but everything else is the same. + * For the more typical use case of allocating a new image of the same + * dimensions, use src->p.w as width. + */ +int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w, + const Dav1dPicture *src); + +/** + * Create a copy of a picture. + */ +void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src); +void dav1d_thread_picture_ref(Dav1dThreadPicture *dst, + const Dav1dThreadPicture *src); +void dav1d_thread_picture_unref(Dav1dThreadPicture *p); + +/** + * Move a picture reference. + */ +void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src); + +/** + * Wait for picture to reach a certain stage. + * + * y is in full-pixel units. If pt is not UV, this is in luma + * units, else it is in chroma units. + * plane_type is used to determine how many pixels delay are + * introduced by loopfilter processes. 
+ * + * Returns 0 on success, and 1 if there was an error while decoding p + */ +int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y, + enum PlaneType plane_type); + +/** + * Signal decoding progress. + * + * y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding + * error to frames using this frame as reference frame. + * plane_type denotes whether we have completed block data (pass 1; + * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no + * 2-pass decoding; PLANE_TYPE_ALL). + */ +void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y, + enum PlaneType plane_type); + +int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie); +void dav1d_default_picture_release(Dav1dPicture *p, void *cookie); +void dav1d_picture_unref_internal(Dav1dPicture *p); + +#endif /* DAV1D_SRC_PICTURE_H */ diff --git a/third_party/dav1d/src/ppc/cdef_init_tmpl.c b/third_party/dav1d/src/ppc/cdef_init_tmpl.c new file mode 100644 index 0000000000..07cbce6de4 --- /dev/null +++ b/third_party/dav1d/src/ppc/cdef_init_tmpl.c @@ -0,0 +1,488 @@ +/* + * Copyright © 2019, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "common/bitdepth.h" +#include "common/intops.h" + +#include "src/cdef.h" +#include "src/cpu.h" + +#include "src/ppc/types.h" + +#if BITDEPTH == 8 +static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, + const int damping) +{ + const i16x8 zero = vec_splat_s16(0); + if (!threshold) return zero; + const uint16_t shift = imax(0, damping - ulog2(threshold)); + const i16x8 abs_diff = vec_abs(diff); + const b16x8 mask = vec_cmplt(diff, zero); + const i16x8 thr = vec_splats(threshold); + const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift))); + const i16x8 max = vec_max(zero, sub); + const i16x8 min = vec_min(abs_diff, max); + const i16x8 neg = vec_sub(zero, min); + return vec_sel(min, neg, mask); +} + +static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, + const uint8_t *src, const ptrdiff_t src_stride, + const uint8_t (*left)[2], const uint8_t *const top, + const int w, const int h, + const enum CdefEdgeFlags edges) +{ + const u16x8 fill = vec_splats((uint16_t)INT16_MAX); + + u16x8 l0; + u16x8 l1; + + int y_start = -2, y_end = h + 2; + + // Copy top and bottom first + if (!(edges & CDEF_HAVE_TOP)) { + l0 = fill; + l1 = fill; + y_start = 0; + } else { + l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2)); + l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2)); + } + + vec_st(l0, 0, tmp - 2 * 8); + vec_st(l1, 0, tmp - 1 * 8); + + if (!(edges & 
CDEF_HAVE_BOTTOM)) { + l0 = fill; + l1 = fill; + y_end -= 2; + } else { + l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride)); + l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride)); + } + + vec_st(l0, 0, tmp + (h + 0) * 8); + vec_st(l1, 0, tmp + (h + 1) * 8); + + for (int y = 0; y < h; y++) { + u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride)); + vec_st(l, 0, tmp + y * 8); + } + + if (!(edges & CDEF_HAVE_LEFT)) { + for (int y = y_start; y < y_end; y++) { + tmp[y * 8] = INT16_MAX; + tmp[1 + y * 8] = INT16_MAX; + } + } else { + for (int y = 0; y < h; y++) { + tmp[y * 8] = left[y][0]; + tmp[1 + y * 8] = left[y][1]; + } + } + if (!(edges & CDEF_HAVE_RIGHT)) { + for (int y = y_start; y < y_end; y++) { + tmp[- 2 + (y + 1) * 8] = INT16_MAX; + tmp[- 1 + (y + 1) * 8] = INT16_MAX; + } + } +} + +static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride, + const uint8_t *src, const ptrdiff_t src_stride, + const uint8_t (*left)[2], const uint8_t *const top, + const int w, const int h, + const enum CdefEdgeFlags edges) +{ + const u16x8 fill = vec_splats((uint16_t)INT16_MAX); + + u16x8 l0h, l0l; + u16x8 l1h, l1l; + + int y_start = -2, y_end = h + 2; + + // Copy top and bottom first + if (!(edges & CDEF_HAVE_TOP)) { + l0h = fill; + l0l = fill; + l1h = fill; + l1l = fill; + y_start = 0; + } else { + u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2); + u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2); + l0h = u8h_to_u16(l0); + l0l = u8l_to_u16(l0); + l1h = u8h_to_u16(l1); + l1l = u8l_to_u16(l1); + } + + vec_st(l0h, 0, tmp - 4 * 8); + vec_st(l0l, 0, tmp - 3 * 8); + vec_st(l1h, 0, tmp - 2 * 8); + vec_st(l1l, 0, tmp - 1 * 8); + + if (!(edges & CDEF_HAVE_BOTTOM)) { + l0h = fill; + l0l = fill; + l1h = fill; + l1l = fill; + y_end -= 2; + } else { + u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride); + u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride); + l0h = u8h_to_u16(l0); + l0l = u8l_to_u16(l0); + l1h = u8h_to_u16(l1); + l1l 
= u8l_to_u16(l1); + } + + vec_st(l0h, 0, tmp + (h + 0) * 16); + vec_st(l0l, 0, tmp + (h + 0) * 16 + 8); + vec_st(l1h, 0, tmp + (h + 1) * 16); + vec_st(l1l, 0, tmp + (h + 1) * 16 + 8); + + for (int y = 0; y < h; y++) { + u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride); + u16x8 lh = u8h_to_u16(l); + u16x8 ll = u8l_to_u16(l); + vec_st(lh, 0, tmp + y * 16); + vec_st(ll, 0, tmp + 8 + y * 16); + } + + if (!(edges & CDEF_HAVE_LEFT)) { + for (int y = y_start; y < y_end; y++) { + tmp[y * 16] = INT16_MAX; + tmp[1 + y * 16] = INT16_MAX; + } + } else { + for (int y = 0; y < h; y++) { + tmp[y * 16] = left[y][0]; + tmp[1 + y * 16] = left[y][1]; + } + } + if (!(edges & CDEF_HAVE_RIGHT)) { + for (int y = y_start; y < y_end; y++) { + tmp[- 6 + (y + 1) * 16] = INT16_MAX; + tmp[- 5 + (y + 1) * 16] = INT16_MAX; + } + } +} + +static inline i16x8 max_mask(i16x8 a, i16x8 b) { + const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX); + + const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX); + + const i16x8 val = vec_sel(a, b, mask); + + return vec_max(val, b); +} + +#define LOAD_PIX(addr) \ + const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \ + i16x8 max = px; \ + i16x8 min = px; \ + i16x8 sum = vec_splat_s16(0); + +#define LOAD_PIX4(addr) \ + const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \ + const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \ + const i16x8 px = vec_xxpermdi(a, b, 0); \ + i16x8 max = px; \ + i16x8 min = px; \ + i16x8 sum = vec_splat_s16(0); + +#define LOAD_DIR(p, addr, o0, o1) \ + const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \ + const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \ + const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \ + const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1); + +#define LOAD_DIR4(p, addr, o0, o1) \ + LOAD_DIR(p ## a, addr, o0, o1) \ + LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \ + const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \ + const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \ + const 
i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \ + const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0); + +#define CONSTRAIN(p, strength) \ + const i16x8 p ## _d0 = vec_sub(p ## 0, px); \ + const i16x8 p ## _d1 = vec_sub(p ## 1, px); \ + const i16x8 p ## _d2 = vec_sub(p ## 2, px); \ + const i16x8 p ## _d3 = vec_sub(p ## 3, px); \ +\ + i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \ + i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \ + i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \ + i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping); + +#define MIN_MAX(p) \ + max = max_mask(p ## 0, max); \ + min = vec_min(p ## 0, min); \ + max = max_mask(p ## 1, max); \ + min = vec_min(p ## 1, min); \ + max = max_mask(p ## 2, max); \ + min = vec_min(p ## 2, min); \ + max = max_mask(p ## 3, max); \ + min = vec_min(p ## 3, min); + +#define PRI_0(p) \ + p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \ + p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even))); + +#define PRI_1(p) \ + p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \ + p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even))); + +#define SEC_0(p) \ + p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \ + p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \ + p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \ + p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1)); + +#define UPDATE_SUM(p) \ + const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \ + const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \ + sum = vec_add(sum, p ## sum0); \ + sum = vec_add(sum, p ## sum1); + +static inline void +filter_4xN(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const int w, const int h, const int pri_strength, + const int sec_strength, const int dir, + const int damping, 
const enum CdefEdgeFlags edges, + const ptrdiff_t tmp_stride, uint16_t *tmp) +{ + const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { + { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, + { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, + { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } + }; + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); + const int off1 = cdef_directions[dir][0]; + const int off1_1 = cdef_directions[dir][1]; + + const int off2 = cdef_directions[(dir + 2) & 7][0]; + const int off3 = cdef_directions[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + + + copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges); + for (int y = 0; y < h / 2; y++) { + LOAD_PIX4(tmp) + + // Primary pass + LOAD_DIR4(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength) + + MIN_MAX(p) + + PRI_0(p) + PRI_1(p) + + UPDATE_SUM(p) + + // Secondary pass 1 + LOAD_DIR4(s, tmp, off2, off3) + + CONSTRAIN(s, sec_strength) + + MIN_MAX(s) + + SEC_0(s) + + UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR4(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength) + + MIN_MAX(s2) + + UPDATE_SUM(s2) + + // Store + i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); + bias = vec_sub(vec_splat_s16(8), bias); + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); + i16x8 vdst = vec_max(vec_min(unclamped, max), min); + + dst[0] = vdst[0]; + dst[1] = vdst[1]; + dst[2] = vdst[2]; + dst[3] = vdst[3]; + + tmp += tmp_stride; + dst += PXSTRIDE(dst_stride); + dst[0] = vdst[4]; + dst[1] = vdst[5]; + dst[2] = vdst[6]; + dst[3] = vdst[7]; + + tmp += 
tmp_stride; + dst += PXSTRIDE(dst_stride); + } +} + +static inline void +filter_8xN(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const int w, const int h, const int pri_strength, + const int sec_strength, const int dir, + const int damping, const enum CdefEdgeFlags edges, + const ptrdiff_t tmp_stride, uint16_t *tmp) +{ + const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { + { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, + { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, + { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } + }; + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + + + const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); + const int off1 = cdef_directions[dir][0]; + const int off1_1 = cdef_directions[dir][1]; + + const int off2 = cdef_directions[(dir + 2) & 7][0]; + const int off3 = cdef_directions[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + + copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges); + + for (int y = 0; y < h; y++) { + LOAD_PIX(tmp) + + // Primary pass + LOAD_DIR(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength) + + MIN_MAX(p) + + PRI_0(p) + PRI_1(p) + + UPDATE_SUM(p) + + // Secondary pass 1 + LOAD_DIR(s, tmp, off2, off3) + + CONSTRAIN(s, sec_strength) + + MIN_MAX(s) + + SEC_0(s) + + UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength) + + MIN_MAX(s2) + + UPDATE_SUM(s2) + + // Store + i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); + bias = vec_sub(vec_splat_s16(8), bias); + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), 
vec_splat_u16(4))); + i16x8 vdst = vec_max(vec_min(unclamped, max), min); + + dst[0] = vdst[0]; + dst[1] = vdst[1]; + dst[2] = vdst[2]; + dst[3] = vdst[3]; + dst[4] = vdst[4]; + dst[5] = vdst[5]; + dst[6] = vdst[6]; + dst[7] = vdst[7]; + + tmp += tmp_stride; + dst += PXSTRIDE(dst_stride); + } + +} + + +#define cdef_fn(w, h, tmp_stride) \ +static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ + filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \ + dir, damping, edges, tmp_stride, tmp); \ +} + +cdef_fn(4, 4, 8); +cdef_fn(4, 8, 8); +cdef_fn(8, 8, 16); +#endif + +COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; + +#if BITDEPTH == 8 + // c->dir = dav1d_cdef_find_dir_vsx; + c->fb[0] = cdef_filter_8x8_vsx; + c->fb[1] = cdef_filter_4x8_vsx; + c->fb[2] = cdef_filter_4x4_vsx; +#endif +} diff --git a/third_party/dav1d/src/ppc/cpu.c b/third_party/dav1d/src/ppc/cpu.c new file mode 100644 index 0000000000..fe77057c57 --- /dev/null +++ b/third_party/dav1d/src/ppc/cpu.c @@ -0,0 +1,51 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common/attributes.h" + +#include "src/ppc/cpu.h" + +#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE +#include <sys/auxv.h> /* declares getauxval() / elf_aux_info() used below */ +#define HAVE_AUX /* marks that hw_cap is declared and filled in below */ +#endif + +COLD unsigned dav1d_get_cpu_flags_ppc(void) { /* returns a mask of DAV1D_PPC_CPU_FLAG_* bits; stays 0 when HAVE_AUX is not defined */ + unsigned flags = 0; +#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE + unsigned long hw_cap = getauxval(AT_HWCAP); +#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE + unsigned long hw_cap = 0; /* stays 0 if elf_aux_info() does not write it; its return value is not checked */ + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); +#endif +#ifdef HAVE_AUX + flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0; +#endif + return flags; +} diff --git a/third_party/dav1d/src/ppc/cpu.h b/third_party/dav1d/src/ppc/cpu.h new file mode 100644 index 0000000000..cfd2ff4ff5 --- /dev/null +++ b/third_party/dav1d/src/ppc/cpu.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Janne Grunau + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_PPC_CPU_H +#define DAV1D_SRC_PPC_CPU_H + +enum CpuFlags { /* bit flags OR-ed into the value returned by dav1d_get_cpu_flags_ppc() */ + DAV1D_PPC_CPU_FLAG_VSX = 1 << 0, /* set in ppc/cpu.c when AT_HWCAP has PPC_FEATURE_HAS_VSX */ +}; + +unsigned dav1d_get_cpu_flags_ppc(void); /* returns a mask of the CpuFlags bits above */ + +#endif /* DAV1D_SRC_PPC_CPU_H */ diff --git a/third_party/dav1d/src/ppc/looprestoration_init_tmpl.c b/third_party/dav1d/src/ppc/looprestoration_init_tmpl.c new file mode 100644 index 0000000000..78ff129703 --- /dev/null +++ b/third_party/dav1d/src/ppc/looprestoration_init_tmpl.c @@ -0,0 +1,350 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Michail Alvanos + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "common/intops.h" +#include "src/ppc/types.h" +#include "src/cpu.h" +#include "src/looprestoration.h" + +#if BITDEPTH == 8 + +#define REST_UNIT_STRIDE (400) + +static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) { + v = vec_max(minv, v); + v = vec_min(maxv, v); + return v; +} + +#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \ + i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \ + i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \ + ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \ + ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \ +} while (0) + +static void wiener_filter_h_vsx(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[7], + const int w, const int h) +{ + static const i32x4 zerov = vec_splats(0); + static const i32x4 seven_vec = vec_splats(7); + static const i32x4 bitdepth_added_vec = vec_splats(1 << 14); + static const i32x4 round_bits_vec = vec_splats(3); + static const i32x4 rounding_off_vec = vec_splats(1<<2); + static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1); + + i16x8 filterhvall = vec_vsx_ld(0, filterh); + i16x8 filterhv0 = vec_splat( filterhvall, 0); + i16x8 filterhv1 = vec_splat( filterhvall, 1); + i16x8 filterhv2 = vec_splat( filterhvall, 2); + i16x8 filterhv3 = vec_splat( filterhvall, 3); + i16x8 filterhv4 = vec_splat( filterhvall, 4); + i16x8 filterhv5 = vec_splat( filterhvall, 5); + i16x8 filterhv6 = vec_splat( filterhvall, 6); + + for (int j = 0; j < h + 6; j++) { + for (int i = 0; i < w; i+=16) { + i32x4 sum1 = bitdepth_added_vec; + i32x4 sum2 = bitdepth_added_vec; + i32x4 sum3 = bitdepth_added_vec; + i32x4 sum4 = bitdepth_added_vec; + + u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]); + u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]); + + u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15); + u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14); + u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13); + u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12); + u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11); + u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 
10); + + u16x8 tmp_u16_high = u8h_to_u16(tmp_v3); + u16x8 tmp_u16_low = u8l_to_u16(tmp_v3); + + i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high); + i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high); + i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low); + i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low); + + i16x8 ssum1 = (i16x8) zerov; + i16x8 ssum2 = (i16x8) zerov; + + APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2); + APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2); + APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2); + APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2); + APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2); + APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2); + APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2); + + sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec); + sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec); + sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec); + sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec); + + sum1 = (sum1 + rounding_off_vec) >> round_bits_vec; + sum2 = (sum2 + rounding_off_vec) >> round_bits_vec; + sum3 = (sum3 + rounding_off_vec) >> round_bits_vec; + sum4 = (sum4 + rounding_off_vec) >> round_bits_vec; + + sum1 = iclip_vec(sum1, zerov, clip_limit_v); + sum2 = iclip_vec(sum2, zerov, clip_limit_v); + sum3 = iclip_vec(sum3, zerov, clip_limit_v); + sum4 = iclip_vec(sum4, zerov, clip_limit_v); + + vec_st(sum1, 0, &hor_ptr[i]); + vec_st(sum2, 16, &hor_ptr[i]); + vec_st(sum3, 32, &hor_ptr[i]); + vec_st(sum4, 48, &hor_ptr[i]); + } + tmp_ptr += REST_UNIT_STRIDE; + hor_ptr += REST_UNIT_STRIDE; + } +} + +static inline i16x8 iclip_u8_vec(i16x8 v) { + static const i16x8 zerov = vec_splats((int16_t)0); + static const i16x8 maxv = vec_splats((int16_t)255); + v = vec_max(zerov, v); + v = vec_min(maxv, v); + return v; +} + +#define APPLY_FILTER_V(index, f) do { \ + i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v3 = 
vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + sum1 = sum1 + v1 * f; \ + sum2 = sum2 + v2 * f; \ + sum3 = sum3 + v3 * f; \ + sum4 = sum4 + v4 * f; \ +} while (0) + +#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \ + i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ + i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ + i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ + i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ + i32x4 sum1 = -round_offset_vec; \ + i32x4 sum2 = -round_offset_vec; \ + i32x4 sum3 = -round_offset_vec; \ + i32x4 sum4 = -round_offset_vec; \ + APPLY_FILTER_V(0, filterv0); \ + APPLY_FILTER_V(1, filterv1); \ + APPLY_FILTER_V(2, filterv2); \ + APPLY_FILTER_V(3, filterv3); \ + APPLY_FILTER_V(4, filterv4); \ + APPLY_FILTER_V(5, filterv5); \ + APPLY_FILTER_V(6, filterv6); \ + sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \ + sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \ + sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \ + sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \ + sum1 = sum1 >> round_bits_vec; \ + sum2 = sum2 >> round_bits_vec; \ + sum3 = sum3 >> round_bits_vec; \ + sum4 = sum4 >> round_bits_vec; \ + i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \ + i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \ + sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \ + sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \ + sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \ +} while (0) + +static inline void wiener_filter_v_vsx(uint8_t *p, + const ptrdiff_t p_stride, + const int32_t *hor, + const int16_t filterv[7], + const int w, const int h) +{ + static const i32x4 round_bits_vec = vec_splats(11); + static const i32x4 rounding_off_vec = vec_splats(1 << 10); + static const i32x4 round_offset_vec = 
vec_splats(1 << 18); + static const i32x4 seven_vec = vec_splats(7); + + i32x4 filterv0 = vec_splats((int32_t) filterv[0]); + i32x4 filterv1 = vec_splats((int32_t) filterv[1]); + i32x4 filterv2 = vec_splats((int32_t) filterv[2]); + i32x4 filterv3 = vec_splats((int32_t) filterv[3]); + i32x4 filterv4 = vec_splats((int32_t) filterv[4]); + i32x4 filterv5 = vec_splats((int32_t) filterv[5]); + i32x4 filterv6 = vec_splats((int32_t) filterv[6]); + + for (int j = 0; j < h; j++) { + for (int i = 0; i <(w-w%16); i += 16) { + u8x16 sum_pixel; + LOAD_AND_APPLY_FILTER_V(sum_pixel, hor); + vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(p_stride) + i]); + } + // remaining loop + if (w & 0xf){ + int i=w-w%16; + ALIGN_STK_16(uint8_t, tmp_out, 16,); + u8x16 sum_pixel; + + LOAD_AND_APPLY_FILTER_V(sum_pixel, hor); + vec_vsx_st(sum_pixel, 0, tmp_out); + + for (int k=0; iwiener = wiener_filter_vsx; +#endif +} + + diff --git a/third_party/dav1d/src/ppc/types.h b/third_party/dav1d/src/ppc/types.h new file mode 100644 index 0000000000..0b4bd72f0e --- /dev/null +++ b/third_party/dav1d/src/ppc/types.h @@ -0,0 +1,54 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_PPC_TYPES_H +#define DAV1D_SRC_PPC_TYPES_H + +#include <altivec.h> /* provides the vector types and vec_* intrinsics used by the macros below */ +#undef pixel /* NOTE(review): presumably removes the AltiVec "pixel" keyword macro so the project's own pixel type stays usable — confirm */ + +#define u8x16 vector unsigned char +#define i8x16 vector signed char +#define b8x16 vector bool char +#define u16x8 vector unsigned short +#define i16x8 vector signed short +#define b16x8 vector bool short +#define u32x4 vector unsigned int +#define i32x4 vector signed int +#define b32x4 vector bool int +#define u64x2 vector unsigned long long +#define i64x2 vector signed long long +#define b64x2 vector bool long long + +#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) /* u8*/u16* helpers: interleave v with an all-zero vector via vec_mergeh/vec_mergel, then reinterpret */ +#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) +#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0))) +#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v)) /* i16* helpers: widen signed halves with vec_unpackh/vec_unpackl */ +#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0))) +#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v)) + +#endif /* DAV1D_SRC_PPC_TYPES_H */ diff --git a/third_party/dav1d/src/qm.c b/third_party/dav1d/src/qm.c new file mode 100644 index 0000000000..8d9a0f954a --- /dev/null +++ b/third_party/dav1d/src/qm.c @@ -0,0 +1,3152 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include + +#include "common/attributes.h" + +#include "src/qm.h" + +static const uint8_t qm_tbl_4x4_t[][2][10] = { + { + { + 32, + 43, 67, + 73, 94, 137, + 97, 110, 150, 200, + }, { + 35, + 46, 60, + 57, 69, 90, + 66, 71, 90, 109, + }, + }, { + { + 32, + 41, 63, + 69, 88, 127, + 92, 103, 140, 184, + }, { + 33, + 45, 58, + 56, 66, 86, + 64, 69, 87, 105, + }, + }, { + { + 32, + 38, 56, + 63, 78, 113, + 86, 97, 130, 169, + }, { + 32, + 45, 55, + 53, 62, 80, + 63, 67, 84, 101, + }, + }, { + { + 32, + 37, 54, + 58, 72, 102, + 81, 91, 121, 156, + }, { + 32, + 45, 54, + 51, 59, 75, + 61, 65, 81, 97, + }, + }, { + { + 32, + 34, 49, + 53, 64, 91, + 75, 81, 112, 140, + }, { + 32, + 46, 53, + 49, 55, 70, + 58, 62, 78, 91, + }, + }, { + { + 32, + 34, 48, + 49, 60, 82, + 72, 79, 104, 134, + }, { + 32, + 46, 53, + 47, 54, 66, + 57, 60, 75, 89, + }, + }, { + { + 32, + 33, 39, + 45, 51, 71, + 62, 64, 87, 108, + }, { + 31, + 42, 48, + 47, 50, 61, + 53, 54, 67, 78, + }, + }, { + { + 32, + 33, 38, + 42, 46, 63, + 55, 57, 75, 92, + }, { + 31, + 41, 48, + 46, 48, 58, + 51, 51, 62, 71, + }, + }, { + { + 32, + 32, 35, + 38, 40, 54, + 51, 49, 64, 81, + }, { + 31, + 38, 47, + 47, 46, 54, + 49, 46, 57, 66, + }, + }, { + { + 32, + 32, 34, + 35, 37, 48, + 43, 43, 54, 65, + }, { + 31, + 37, 44, + 47, 47, 53, + 47, 45, 53, 59, + }, + }, { + { + 32, + 32, 33, + 34, 35, 39, + 38, 39, 45, 54, + }, { + 31, + 34, 39, + 42, 45, 48, + 47, 46, 49, 54, + }, + }, { + { + 32, + 32, 32, + 32, 33, 35, + 35, 35, 38, 46, + }, { + 31, + 32, 34, + 38, 41, 47, + 46, 46, 47, 52, + }, + }, { + { + 31, + 32, 32, + 32, 32, 33, + 32, 33, 34, 35, + }, { + 31, + 31, 32, + 34, 35, 39, + 38, 40, 43, 47, + }, + }, { + { + 31, + 31, 32, + 31, 32, 32, + 32, 32, 32, 33, + }, { + 31, + 31, 31, + 31, 31, 32, + 34, 35, 35, 39, + }, + }, { + { + 31, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + }, { + 31, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + }, + }, +}; + +static const uint8_t 
qm_tbl_8x4[][2][32] = { + { + { + 32, 33, 37, 49, 65, 80, 91, 104, + 42, 42, 58, 71, 84, 97, 100, 112, + 75, 69, 84, 103, 125, 142, 145, 146, + 91, 86, 91, 110, 128, 152, 178, 190, + }, { + 31, 40, 46, 48, 54, 61, 64, 68, + 47, 45, 56, 61, 65, 69, 68, 71, + 60, 54, 64, 75, 85, 92, 90, 87, + 66, 61, 64, 73, 82, 92, 102, 105, + }, + }, { + { + 32, 33, 36, 46, 60, 75, 86, 98, + 42, 42, 56, 67, 79, 92, 95, 105, + 69, 64, 77, 93, 112, 130, 136, 136, + 88, 83, 88, 105, 122, 144, 167, 177, + }, { + 31, 40, 46, 47, 52, 59, 63, 66, + 47, 45, 55, 60, 64, 68, 66, 69, + 57, 52, 61, 70, 79, 87, 88, 85, + 65, 61, 63, 72, 81, 90, 99, 102, + }, + }, { + { + 32, 32, 34, 44, 54, 72, 82, 92, + 38, 40, 51, 61, 69, 84, 89, 98, + 62, 58, 68, 85, 98, 118, 129, 127, + 86, 80, 85, 101, 117, 136, 157, 165, + }, { + 31, 38, 46, 46, 50, 57, 61, 65, + 47, 46, 53, 56, 59, 64, 65, 67, + 54, 50, 57, 66, 74, 82, 85, 82, + 64, 60, 62, 71, 79, 88, 97, 99, + }, + }, { + { + 32, 32, 34, 41, 51, 65, 75, 86, + 35, 36, 47, 53, 61, 73, 81, 92, + 59, 57, 65, 78, 92, 108, 117, 119, + 83, 78, 82, 97, 111, 129, 148, 154, + }, { + 31, 36, 46, 45, 49, 54, 59, 63, + 47, 47, 52, 53, 55, 58, 61, 65, + 53, 50, 55, 63, 71, 77, 81, 80, + 63, 59, 61, 70, 77, 86, 94, 95, + }, + }, { + { + 32, 32, 34, 38, 48, 60, 72, 81, + 35, 36, 42, 51, 59, 68, 79, 86, + 51, 50, 54, 67, 80, 92, 104, 112, + 77, 72, 75, 87, 103, 119, 135, 144, + }, { + 31, 36, 43, 45, 47, 52, 57, 61, + 47, 47, 50, 53, 54, 56, 60, 63, + 50, 47, 50, 58, 66, 70, 75, 77, + 61, 57, 58, 65, 74, 82, 90, 93, + }, + }, { + { + 32, 32, 34, 37, 45, 54, 65, 75, + 35, 36, 42, 50, 56, 63, 73, 81, + 51, 50, 54, 65, 76, 87, 97, 106, + 75, 71, 73, 84, 96, 110, 125, 136, + }, { + 31, 36, 43, 46, 46, 50, 54, 59, + 47, 47, 50, 53, 54, 55, 58, 61, + 50, 47, 50, 57, 64, 68, 72, 75, + 60, 56, 57, 64, 71, 78, 85, 90, + }, + }, { + { + 32, 32, 33, 35, 41, 49, 57, 66, + 34, 34, 37, 43, 48, 54, 60, 68, + 43, 42, 44, 54, 64, 71, 78, 86, + 62, 59, 58, 68, 79, 91, 101, 111, + }, { + 
31, 33, 40, 47, 45, 48, 51, 55, + 42, 44, 47, 50, 49, 50, 52, 55, + 47, 45, 46, 54, 59, 61, 63, 66, + 54, 51, 50, 57, 64, 70, 75, 79, + }, + }, { + { + 32, 32, 32, 34, 38, 44, 50, 61, + 32, 33, 35, 37, 40, 45, 50, 58, + 42, 41, 42, 50, 58, 66, 71, 79, + 56, 53, 52, 59, 68, 78, 86, 97, + }, { + 31, 32, 39, 44, 46, 47, 48, 53, + 38, 40, 47, 47, 47, 46, 47, 50, + 47, 45, 45, 51, 56, 59, 61, 64, + 52, 49, 48, 53, 58, 64, 68, 73, + }, + }, { + { + 32, 32, 32, 34, 35, 40, 46, 52, + 32, 33, 34, 37, 38, 42, 46, 51, + 37, 36, 38, 44, 49, 55, 59, 64, + 52, 49, 49, 54, 60, 69, 76, 83, + }, { + 31, 31, 36, 42, 47, 46, 48, 50, + 38, 40, 44, 47, 48, 46, 46, 48, + 47, 46, 47, 50, 53, 54, 55, 56, + 50, 48, 47, 50, 54, 60, 64, 67, + }, + }, { + { + 31, 32, 32, 32, 34, 37, 42, 46, + 32, 33, 34, 35, 37, 40, 43, 46, + 35, 34, 36, 38, 43, 49, 53, 56, + 43, 41, 42, 42, 49, 56, 63, 67, + }, { + 31, 31, 35, 39, 43, 47, 46, 48, + 38, 40, 43, 47, 47, 47, 46, 46, + 47, 46, 47, 47, 50, 53, 53, 54, + 48, 45, 46, 45, 50, 55, 58, 59, + }, + }, { + { + 31, 32, 32, 32, 33, 34, 37, 40, + 32, 32, 33, 33, 34, 36, 38, 40, + 34, 34, 34, 36, 38, 41, 44, 46, + 39, 38, 38, 40, 42, 47, 52, 56, + }, { + 31, 31, 33, 36, 40, 45, 47, 47, + 34, 35, 37, 41, 44, 46, 47, 46, + 42, 42, 44, 46, 48, 49, 50, 49, + 48, 46, 46, 46, 48, 51, 54, 55, + }, + }, { + { + 31, 32, 32, 32, 32, 33, 34, 35, + 31, 32, 32, 32, 33, 33, 34, 34, + 32, 32, 33, 34, 35, 36, 37, 38, + 35, 35, 34, 36, 38, 40, 42, 48, + }, { + 31, 31, 31, 34, 37, 39, 42, 48, + 31, 31, 32, 36, 39, 41, 43, 46, + 37, 38, 40, 43, 46, 47, 47, 48, + 48, 47, 46, 47, 47, 48, 50, 53, + }, + }, { + { + 31, 31, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 33, 34, 34, 35, + 32, 32, 32, 33, 34, 34, 35, 36, + }, { + 31, 31, 31, 31, 34, 35, 38, 41, + 31, 31, 32, 32, 36, 37, 40, 42, + 35, 36, 37, 37, 40, 42, 45, 45, + 37, 38, 39, 40, 43, 44, 47, 47, + }, + }, { + { + 31, 31, 31, 31, 31, 31, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 
32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, + }, { + 31, 31, 31, 31, 31, 31, 34, 34, + 31, 31, 31, 32, 32, 33, 36, 36, + 31, 31, 31, 32, 32, 33, 36, 36, + 34, 35, 35, 36, 36, 37, 40, 40, + }, + }, { + { + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, + }, { + 31, 31, 31, 31, 31, 31, 31, 30, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, + }, + }, +}; + +static const uint8_t qm_tbl_8x8_t[][2][36] = { + { + { + 32, + 32, 35, + 38, 40, 54, + 51, 49, 65, 82, + 68, 63, 78, 97, 117, + 84, 76, 91, 111, 134, 152, + 95, 89, 98, 113, 138, 159, 183, + 109, 102, 106, 121, 142, 168, 199, 220, + }, { + 31, + 38, 47, + 47, 46, 54, + 50, 47, 57, 66, + 57, 52, 61, 72, 82, + 63, 57, 66, 77, 88, 96, + 67, 62, 67, 75, 86, 95, 104, + 71, 67, 68, 75, 84, 95, 107, 113, + }, + }, { + { + 32, + 32, 35, + 37, 39, 51, + 47, 46, 60, 73, + 62, 58, 71, 87, 105, + 78, 72, 84, 100, 121, 140, + 90, 84, 93, 106, 129, 148, 169, + 102, 96, 100, 113, 132, 155, 183, 201, + }, { + 31, + 38, 47, + 47, 47, 53, + 48, 46, 55, 62, + 54, 50, 58, 67, 76, + 61, 55, 63, 72, 83, 91, + 66, 61, 65, 73, 84, 92, 101, + 69, 65, 66, 73, 82, 92, 103, 109, + }, + }, { + { + 32, + 32, 34, + 35, 37, 48, + 46, 45, 56, 70, + 57, 54, 64, 80, 93, + 76, 70, 79, 96, 111, 134, + 85, 79, 87, 100, 121, 138, 156, + 96, 90, 93, 105, 122, 144, 168, 184, + }, { + 31, + 36, 43, + 47, 47, 53, + 48, 46, 54, 61, + 52, 49, 55, 65, 71, + 60, 55, 60, 70, 78, 89, + 64, 59, 63, 71, 81, 89, 97, + 67, 63, 64, 71, 79, 89, 99, 104, + }, + }, { + { + 32, + 32, 33, + 35, 36, 46, + 42, 42, 52, 63, + 53, 51, 60, 73, 86, + 68, 64, 72, 84, 100, 117, + 78, 74, 80, 92, 109, 128, 140, + 90, 84, 87, 98, 114, 133, 155, 168, + }, { + 31, + 34, 39, + 46, 47, 52, + 47, 45, 52, 58, + 50, 48, 54, 62, 68, + 57, 53, 58, 65, 73, 82, + 61, 57, 61, 68, 77, 86, 91, + 65, 61, 62, 68, 76, 86, 95, 100, + }, + }, { + { 
+ 32, + 32, 33, + 34, 35, 39, + 39, 40, 46, 56, + 50, 48, 53, 65, 78, + 62, 59, 63, 75, 90, 105, + 76, 71, 74, 86, 101, 118, 134, + 84, 79, 81, 92, 106, 123, 142, 153, + }, { + 31, + 34, 39, + 42, 45, 48, + 47, 46, 49, 55, + 49, 47, 50, 58, 65, + 54, 51, 53, 61, 69, 76, + 60, 56, 57, 65, 73, 82, 89, + 64, 59, 60, 66, 74, 83, 92, 96, + }, + }, { + { + 32, + 32, 33, + 34, 35, 39, + 38, 39, 45, 54, + 46, 45, 51, 61, 71, + 56, 54, 58, 69, 80, 92, + 68, 64, 68, 78, 90, 103, 117, + 78, 74, 76, 86, 99, 113, 128, 140, + }, { + 31, + 34, 39, + 42, 45, 48, + 47, 46, 49, 54, + 48, 46, 50, 56, 61, + 52, 49, 52, 58, 65, 71, + 57, 53, 55, 61, 68, 75, 82, + 61, 57, 58, 64, 71, 79, 86, 91, + }, + }, { + { + 31, + 32, 32, + 32, 33, 35, + 35, 35, 38, 48, + 42, 41, 43, 54, 63, + 51, 49, 49, 59, 71, 81, + 59, 56, 56, 66, 77, 89, 98, + 69, 65, 64, 73, 85, 97, 108, 119, + }, { + 31, + 32, 35, + 38, 42, 47, + 48, 47, 48, 53, + 47, 45, 45, 53, 58, + 50, 47, 47, 54, 61, 66, + 53, 50, 49, 56, 63, 69, 73, + 57, 54, 52, 58, 65, 72, 77, 82, + }, + }, { + { + 31, + 32, 32, + 32, 32, 35, + 34, 34, 37, 42, + 38, 37, 40, 47, 54, + 46, 44, 45, 52, 60, 69, + 52, 49, 49, 56, 65, 75, 82, + 63, 59, 58, 65, 73, 84, 92, 105, + }, { + 31, + 31, 32, + 38, 40, 47, + 44, 44, 47, 50, + 47, 45, 46, 51, 54, + 48, 46, 46, 51, 56, 61, + 50, 47, 47, 52, 57, 63, 66, + 55, 52, 50, 54, 60, 66, 70, 76, + }, + }, { + { + 31, + 32, 32, + 32, 32, 34, + 34, 33, 35, 39, + 35, 34, 37, 42, 48, + 41, 40, 41, 47, 53, 60, + 47, 44, 45, 51, 57, 65, 71, + 53, 50, 51, 55, 61, 70, 77, 85, + }, { + 31, + 31, 32, + 35, 36, 41, + 42, 42, 45, 48, + 48, 46, 47, 50, 53, + 47, 45, 45, 49, 53, 57, + 49, 46, 46, 50, 54, 59, 61, + 51, 48, 48, 51, 54, 60, 64, 68, + }, + }, { + { + 31, + 31, 32, + 32, 32, 33, + 32, 32, 34, 35, + 34, 34, 35, 37, 41, + 37, 36, 38, 39, 45, 51, + 43, 41, 42, 42, 49, 56, 63, + 47, 44, 45, 46, 52, 59, 67, 71, + }, { + 31, + 31, 32, + 34, 35, 39, + 37, 40, 43, 47, + 43, 43, 45, 47, 49, + 48, 46, 46, 47, 50, 53, + 47, 
45, 45, 45, 50, 55, 58, + 49, 46, 46, 46, 50, 55, 60, 61, + }, + }, { + { + 31, + 31, 32, + 32, 32, 32, + 32, 32, 33, 34, + 33, 33, 34, 35, 37, + 34, 34, 35, 36, 39, 43, + 37, 36, 37, 38, 41, 46, 51, + 41, 39, 40, 41, 44, 49, 54, 58, + }, { + 31, + 31, 31, + 32, 33, 35, + 35, 37, 39, 43, + 39, 41, 42, 45, 47, + 45, 44, 45, 47, 48, 50, + 48, 46, 46, 47, 48, 51, 53, + 48, 46, 45, 46, 47, 51, 54, 56, + }, + }, { + { + 31, + 31, 32, + 31, 32, 32, + 32, 32, 32, 33, + 32, 32, 32, 34, 35, + 32, 33, 33, 34, 35, 36, + 34, 34, 33, 35, 36, 38, 39, + 35, 35, 34, 36, 38, 40, 42, 48, + }, { + 31, + 31, 31, + 30, 31, 32, + 34, 34, 35, 39, + 36, 37, 39, 42, 46, + 39, 40, 41, 44, 47, 47, + 42, 42, 42, 45, 47, 48, 48, + 48, 47, 46, 47, 47, 49, 50, 53, + }, + }, { + { + 31, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 32, 32, 32, 32, 33, + 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 34, 34, 35, + 33, 33, 33, 33, 35, 35, 36, 38, + }, { + 31, + 31, 31, + 31, 31, 31, + 30, 31, 31, 32, + 34, 34, 35, 35, 39, + 35, 35, 36, 36, 40, 41, + 37, 38, 39, 40, 43, 44, 47, + 40, 41, 41, 42, 44, 45, 47, 48, + }, + }, { + { + 31, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, + 32, 32, 32, 32, 32, 32, 33, 33, + }, { + 31, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 30, 31, 31, 31, 32, + 31, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 36, 39, + 33, 34, 34, 35, 35, 36, 39, 39, + }, + }, { + { + 31, + 31, 31, + 31, 31, 31, + 31, 31, 32, 32, + 31, 31, 32, 32, 32, + 31, 31, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, + }, { + 31, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 31, 31, 31, + }, + }, +}; + +static const uint8_t qm_tbl_16x4[][2][64] = { + { + { + 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, + 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, + 79, 72, 71, 
73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, + 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, + }, { + 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, + 49, 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, + 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, + 69, 65, 62, 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, + }, + }, { + { + 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, + 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, + 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, + 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, + }, { + 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, + 49, 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, + 61, 55, 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, + 69, 64, 61, 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, + }, + }, { + { + 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, + 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, + 65, 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, + 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, + }, { + 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, + 48, 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, + 57, 53, 51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, + 68, 63, 60, 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, + }, + }, { + { + 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, + 36, 35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, + 62, 58, 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, + 88, 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, + }, { + 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, + 48, 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, + 56, 52, 
50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, + 67, 62, 60, 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, + }, + }, { + { + 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, + 36, 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, + 53, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, + 81, 76, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, + }, { + 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, + 48, 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, + 52, 49, 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, + 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, + }, + }, { + { + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, + 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, + 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, + 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + }, { + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, + 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, + 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, + 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + }, + }, { + { + 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, + 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, + 44, 43, 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, + 65, 62, 59, 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, + }, { + 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, + 42, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, + 49, 47, 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, + 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, + }, + }, { + { + 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, + 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, + 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 
74, 77, 79, + 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, + }, { + 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, + 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, + 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, + 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, + }, + }, { + { + 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, + 32, 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, + 38, 37, 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, + 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + }, { + 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, + 37, 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, + 48, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, + 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + }, + }, { + { + 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, + 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, + 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, + 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + }, { + 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, + 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, + 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, + 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + }, + }, { + { + 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, + 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, + }, { + 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, + 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, + 42, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, + 48, 47, 47, 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 
56, + }, + }, { + { + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + }, { + 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, + 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + }, + }, { + { + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + }, { + 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, + 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, + 35, 35, 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + }, + }, { + { + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + }, { + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, + }, + }, { + { + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, { + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + }, + }, +}; + +static const uint8_t qm_tbl_16x8[][2][128] = { + { + { + 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, + 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, + 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, + 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, + 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, + 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, + 93, 86, 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, + 99, 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, + }, { + 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, + 37, 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, + 48, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, + 52, 48, 47, 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, + 57, 52, 51, 53, 57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82, + 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91, + 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, 99, 101, 103, 101, + 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, 104, 106, 109, + }, + }, { + { + 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, + 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, + 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, + 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, + 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, + 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, + 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, + 96, 90, 87, 85, 87, 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, + }, { 
+ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, + 35, 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, + 48, 46, 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, + 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, + 57, 52, 51, 51, 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79, + 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89, + 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, 97, 98, 100, 98, + 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, 101, 103, 105, + }, + }, { + { + 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, + 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, + 36, 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, + 44, 41, 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, + 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119, + 79, 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137, + 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160, 158, + 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163, 169, 175, + }, { + 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, + 34, 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, + 48, 46, 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, + 49, 46, 46, 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, + 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77, + 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86, + 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, 94, 96, 97, 95, + 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, 100, 102, + }, + }, { + { + 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, + 31, 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, + 35, 34, 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, + 44, 41, 42, 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, + 53, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111, 
+ 65, 61, 59, 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128, + 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147, + 90, 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157, 163, + }, { + 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, + 33, 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, + 45, 45, 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, + 49, 46, 45, 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, + 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75, + 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83, + 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, 91, 93, 94, 92, + 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, 97, 98, + }, + }, { + { + 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, + 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, + 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, + 40, 39, 38, 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, + 51, 49, 47, 48, 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103, + 65, 61, 59, 58, 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119, + 79, 74, 71, 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137, + 87, 82, 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151, + }, { + 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, + 32, 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, + 40, 41, 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, + 49, 47, 46, 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, + 51, 49, 47, 47, 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72, + 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81, + 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, 86, 89, 91, 89, + 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, 94, 95, + }, + }, { + { + 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 
72, 75, + 32, 32, 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, + 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, + 44, 42, 41, 42, 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, + 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, + 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, + 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + }, { + 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, + 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, + 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, + 49, 47, 45, 46, 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68, + 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, + 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, 76, 79, 82, 83, + 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + }, + }, { + { + 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, + 31, 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, + 32, 32, 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, + 36, 35, 34, 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, + 44, 42, 41, 42, 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90, + 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, + 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115, + 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127, + }, { + 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, + 31, 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, + 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, + 48, 47, 46, 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, + 49, 47, 45, 46, 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67, + 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75, + 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 72, 75, 77, 80, + 61, 57, 55, 54, 52, 56, 
58, 61, 63, 66, 69, 73, 76, 79, 82, 86, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, + 31, 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, + 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, + 35, 35, 34, 35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, + 39, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, + 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, + 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, + 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, 100, 105, + }, { + 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, + 31, 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, + 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, + 45, 45, 44, 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, + 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, + 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, + 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, 66, 68, 69, 70, + 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, 74, 76, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, + 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, + 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, + 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, + 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, + 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, + 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, + 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + }, { + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, + 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, + 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, + 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, + 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 
55, + 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, + 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, + 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, + 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, + 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, + 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, + 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, + 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, + 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, + }, { + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, + 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, + 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, + 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, + 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, + 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, + 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, + }, { + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, + 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, + 31, 31, 31, 32, 32, 36, 36, 40, 
40, 43, 43, 46, 46, 46, 46, 45, + 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, + 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, + 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + }, { + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, + 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, + 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, + 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, + 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, + }, { + 32, 
31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, + 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, + 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, + 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + }, { + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + }, + }, +}; + +static const uint8_t qm_tbl_32x8[][2][256] = { + { + { + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, + 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, + 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, + 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, + 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, + 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, + 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, + 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, + }, { + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, + 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, + 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, + 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, + 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, + 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, + 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, + 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, + 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, + 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, + 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, + 96, 91, 90, 87, 87, 86, 85, 84, 87, 
90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, + }, { + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, + 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, + 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, + 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, + 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, + 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, + 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, + 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, + 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, + 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, + 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 
135, + 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, + 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, + }, { + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, + 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, + 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, + 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, + 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, + 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, + 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, + 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, + 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, + 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, + 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 
110, 113, 114, 112, 111, 110, + 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, + 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, + }, { + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, + 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, + 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, + 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, + 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, + 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, + 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, + 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, + 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 
69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, + 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, + }, { + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, + 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, + 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, + 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, + 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, + 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87, + 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 
40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, + 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, + 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, + }, { + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, + 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, + 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, + 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, + 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, + 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 
34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, + 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, + 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, + }, { + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, + 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, + 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, + 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, + 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, + 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, + 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 
36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, + 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70, + 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, + 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, + 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, + }, { + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, + 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, + 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, + 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 
71, 71, 73, 73, 74, 76, 76, 78, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, + 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, + 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, + 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, + }, { + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, + 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, + 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, + 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, + 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, + 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 
65, 65, + 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, + 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, + 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, + }, { + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, + 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, + 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 49, 48, 47, 47, 
46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, + 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + }, { + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 
47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, + 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, + }, { + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, + 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 
44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, + 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 
38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 
35, 35, 35, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, + 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, + }, +}; + +static const uint8_t qm_tbl_32x16[][2][512] = { + { + { + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, + 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, + 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, + 34, 34, 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107, + 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, + 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119, + 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, + 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, 127, 131, 136, + 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 
92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, + 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156, + 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, + 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178, + 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, + 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, + 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, + 102, 97, 97, 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, + }, { + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 31, 31, 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, + 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67, + 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, + 49, 47, 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, + 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 
70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, + 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80, 81, + 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, + 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, + 67, 64, 62, 61, 60, 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, 100, 98, 96, 96, 97, 99, + 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, + 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, 107, 108, + 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108, + 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, + 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, + 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, + 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100, + 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, + 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 
71, 75, 77, 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, + 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, + 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, + 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, + 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, + 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, + 87, 83, 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, + 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189, + 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, + 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201, + }, { + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, + 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, + 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 
48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 64, 65, + 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, + 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, + 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, + 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77, 78, + 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, + 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, + 67, 63, 61, 60, 59, 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, 97, 95, 93, 93, 94, 96, + 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, + 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, 105, + 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, + 71, 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, + }, + }, { + { + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, + 31, 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, + 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 
50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, + 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, 88, 90, 93, + 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, + 39, 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, + 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, + 53, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118, + 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, + 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135, + 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135, + 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154, + 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, + 90, 86, 84, 82, 81, 80, 78, 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, + 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, + 96, 91, 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, + }, { + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 
59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, + 31, 31, 32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, + 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, + 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, + 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, + 48, 47, 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, + 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, + 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75, 76, + 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, + 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, + 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, 94, 93, 90, 90, 92, 93, + 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, + 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, + 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, + 69, 66, 66, 63, 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96, 101, 101, 
103, 103, 105, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, + 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, + 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, + 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, 85, 87, + 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, + 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, + 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, + 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104, 107, 110, + 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110, + 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126, + 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, + 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163, + 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 
101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, + 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173, + }, { + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, + 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, + 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, 60, 61, + 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, + 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, + 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73, 74, + 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, + 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, + 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 90, 88, 88, 89, 90, + 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, + 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, 
75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, + 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, + 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99, 99, 102, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, + 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, + 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, 79, 81, + 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, + 36, 35, 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, + 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, + 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98, 100, 102, + 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, 117, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, + 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 
90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, + 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160, + }, { + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, + 31, 31, 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, + 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, + 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, 58, 59, + 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, + 48, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, + 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, + 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71, 71, + 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, + 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, + 58, 55, 54, 52, 52, 52, 51, 50, 53, 
54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, 83, 83, 85, 85, 86, 87, + 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87, + 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, + 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, + 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96, 96, 98, + }, + }, { + { + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, + 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 
71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, + 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, + 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, + 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, + 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, + 87, 82, 82, 78, 78, 77, 77, 75, 75, 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + }, { + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, + 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, + 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, + 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, + 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, + 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, + 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, + 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 
50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, + 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, + 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, + 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, 71, 71, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, + 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, + 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, + 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 
64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, + 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, + 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118, + 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, + 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133, + }, { + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, + 31, 31, 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, + 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, + 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 56, 56, + 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, + 42, 42, 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, + 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, + 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 
55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, + 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, + 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, + 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, + 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, + 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, + 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, + 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, + 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, + 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 
61, 63, 66, 66, 70, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68, 72, + 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, + 51, 49, 49, 48, 47, 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, 83, 84, 86, 89, 89, 93, + 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, + 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, + 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, + 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, + }, { + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, + 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, + 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, + 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 45, 45, 45, 
45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, + 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, + 51, 50, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, 67, 67, 68, 69, 69, 70, + 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, + 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, + 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, + 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 
35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, + 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62, 62, + 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, + 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, + 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, + 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, + 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, + 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, + 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + }, { + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, + 31, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, + 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, + 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, + 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 
46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, + 37, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, + 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, + 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, + 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, + 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, + 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65, + 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, + 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, + 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69, 71, 71, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 
45, 45, 47, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, + 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48, 50, + 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, + 40, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, 59, 59, 61, 62, 62, 64, + 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, + 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76, 76, 79, + }, { + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, + 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, + 31, 31, 31, 31, 32, 32, 32, 
33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, + 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48, 48, + 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 49, 48, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, 56, 56, 57, 57, 57, 58, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, + 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64, 64, 65, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, + 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + }, { + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 
47, 47, 47, 47, 47, 47, 47, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, + 31, 
31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, + 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, + 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, + 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, + }, { + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 
31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, + 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, + 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, + 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 
50, 50, 50, 51, 52, 53, 53, 53, 53, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 
38, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43, 43, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 38, 39, 39, 40, 40, 40, 40, 40, 40, 
40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, 37, 37, 37, 37, 37, 38, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 
38, 38, 38, 38, 39, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, + 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43, 43, 44, + }, + }, { + { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 
31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, { + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, + }, +}; + +static const uint8_t qm_tbl_32x32_t[][2][528] = { + { + { + 32, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 33, 33, + 32, 32, 32, 33, 34, 35, + 34, 34, 33, 34, 35, 37, 39, + 35, 34, 34, 35, 36, 37, 41, 43, + 36, 35, 34, 35, 36, 38, 42, 45, 48, + 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, + 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, + 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, + 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, + 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, + 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, + 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, + 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, + 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, 107, 111, 117, + 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134, + 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, + 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, + 88, 84, 80, 79, 78, 76, 80, 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, + 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, + 94, 89, 86, 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166, + 97, 92, 90, 88, 
86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174, + 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183, + 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, + 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200, + 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210, + 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, + 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231, + 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242, + }, { + 32, + 31, 31, + 30, 31, 32, + 32, 33, 33, 35, + 33, 34, 35, 37, 39, + 36, 38, 40, 41, 43, 47, + 41, 42, 42, 43, 45, 47, 48, + 45, 45, 44, 45, 46, 47, 49, 50, + 49, 47, 46, 47, 47, 48, 50, 51, 53, + 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, + 49, 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, + 50, 47, 45, 46, 46, 46, 49, 51, 54, 56, 59, 60, + 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, 61, + 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, + 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, + 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, + 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, + 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79, 82, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89, + 64, 61, 58, 57, 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, + 65, 61, 
58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, + 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, + 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, + 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100, + 70, 66, 64, 63, 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102, + 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, 104, + 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106, + 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109, + 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, 111, + 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, 110, 111, 113, + 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116, + 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118, + }, + }, { + { + 32, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 33, + 32, 32, 32, 33, 34, 35, + 32, 33, 33, 33, 34, 36, 36, + 34, 34, 33, 34, 35, 37, 38, 39, + 36, 35, 34, 35, 36, 38, 40, 42, 48, + 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, + 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, + 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, + 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, + 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, + 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, + 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, + 61, 58, 56, 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, + 65, 
62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, + 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, + 76, 72, 70, 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, + 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134, + 83, 78, 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, 140, + 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, + 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155, + 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, + 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, + 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, + 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184, + 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193, + 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201, + 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, + 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219, + }, { + 32, + 31, 31, + 30, 31, 31, + 31, 32, 32, 33, + 33, 34, 35, 36, 39, + 36, 38, 39, 40, 43, 47, + 38, 40, 41, 41, 44, 47, 47, + 41, 42, 42, 43, 45, 47, 48, 48, + 49, 47, 46, 46, 47, 48, 49, 50, 53, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, + 48, 47, 46, 
45, 46, 46, 48, 49, 53, 54, 54, + 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58, + 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, + 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, + 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, + 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, + 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69, 72, 73, + 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74, 76, + 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, + 62, 59, 57, 56, 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, + 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, + 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, + 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94, + 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, + 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, + 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, + 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, + 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, + 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, + 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, + 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, + 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, + }, + }, { + { + 32, + 31, 32, + 31, 32, 32, 
+ 31, 32, 32, 32, + 31, 32, 32, 32, 33, + 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 34, 34, 35, + 34, 34, 33, 33, 35, 36, 37, 39, + 34, 34, 34, 34, 36, 36, 37, 41, 42, + 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, + 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, + 41, 39, 39, 38, 40, 40, 41, 46, 48, 51, 55, 56, + 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, 58, 59, 63, + 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, 70, + 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73, + 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, + 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, + 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91, 93, + 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, 92, 97, 99, 105, + 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113, + 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117, + 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134, + 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, + 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, + 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, + 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156, + 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163, + 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, + 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176, + 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 
176, 177, 184, + 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, + 107, 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, + }, { + 32, + 31, 31, + 30, 31, 31, + 30, 31, 31, 32, + 33, 34, 35, 35, 39, + 35, 36, 37, 37, 41, 43, + 36, 38, 39, 40, 43, 45, 47, + 41, 42, 42, 42, 45, 46, 47, 48, + 44, 44, 44, 44, 46, 46, 47, 49, 50, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, + 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, + 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 55, + 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, 58, + 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, + 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, + 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, + 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65, 68, 70, + 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70, 71, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, + 59, 56, 54, 53, 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, + 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, + 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, + 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90, + 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, + 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, + 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, + 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, + 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 
101, + 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, + 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, + 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106, + 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108, + }, + }, { + { + 32, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 33, + 31, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 33, 34, 35, + 32, 33, 33, 33, 34, 34, 36, 36, + 34, 34, 34, 33, 35, 35, 37, 38, 39, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, + 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, + 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54, + 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57, + 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, + 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, + 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, + 54, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, + 56, 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, + 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, + 64, 61, 60, 58, 58, 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, + 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, + 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, + 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, + 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, + 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 
119, 121, 128, 131, 137, 139, 140, + 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150, + 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, + 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162, + 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168, + 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174, + 100, 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, + }, { + 32, + 31, 31, + 31, 31, 31, + 30, 31, 31, 32, + 33, 34, 34, 34, 37, + 33, 34, 35, 35, 38, 39, + 36, 38, 39, 40, 42, 43, 47, + 38, 40, 40, 41, 43, 44, 47, 47, + 41, 42, 42, 42, 44, 45, 47, 48, 48, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, + 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53, + 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54, + 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55, + 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, + 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 63, 66, + 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67, 68, + 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, + 56, 54, 53, 51, 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, + 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, + 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, + 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 
83, 84, + 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, + 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, + 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, + 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, + 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, + 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99, + 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100, + 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, 102, + 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, 101, 104, + }, + }, { + { + 32, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 33, 34, 35, 35, + 33, 33, 33, 33, 34, 35, 36, 36, 38, + 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, + 36, 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, + 36, 35, 35, 34, 35, 36, 38, 38, 42, 43, 48, 49, + 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, 54, + 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, + 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, + 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, + 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, + 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, 72, 74, 78, + 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 74, 76, 80, 82, + 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91, + 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 
75, 78, 80, 85, 87, 91, 92, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, + 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, + 71, 68, 67, 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112, 117, + 74, 71, 70, 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, 115, 120, 123, + 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, 134, + 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137, + 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140, + 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147, + 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, + 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, + 93, 88, 88, 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, + }, { + 32, + 31, 31, + 31, 31, 31, + 30, 31, 31, 32, + 31, 32, 32, 33, 34, + 33, 34, 35, 35, 37, 39, + 35, 37, 37, 38, 39, 41, 44, + 36, 38, 39, 40, 41, 43, 46, 47, + 40, 41, 41, 42, 43, 44, 46, 47, 48, + 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, + 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, + 49, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, + 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, + 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, + 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, + 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, + 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61, + 
52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, + 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66, + 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, + 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, + 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, + 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77, + 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, + 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84, + 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, + 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, + 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, + 66, 63, 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, + 67, 64, 63, 61, 60, 59, 58, 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, 93, 94, 94, 96, + 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, 98, + 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, + }, + }, { + { + 32, + 31, 32, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 34, 34, 35, + 32, 32, 32, 32, 32, 34, 34, 35, 35, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, + 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, + 39, 
38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, + 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71, + 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, + 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, + 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, + 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, + 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, + 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, + 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, + 87, 83, 83, 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + }, { + 32, + 31, 31, + 31, 31, 31, + 30, 31, 31, 32, + 30, 31, 31, 32, 32, + 33, 34, 34, 35, 35, 39, + 33, 34, 34, 35, 35, 39, 39, + 36, 38, 38, 40, 40, 43, 43, 47, + 36, 38, 38, 40, 40, 43, 43, 47, 47, + 
41, 42, 42, 42, 42, 45, 45, 47, 47, 48, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, + 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, + 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, + 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, + 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61, + 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, + 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, + 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, + 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, + 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, + 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, + 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, + 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, + 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, + 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91, + 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + }, + }, { + { + 32, + 
31, 31, + 31, 32, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 32, 33, 34, 34, 35, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, + 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, + 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, + 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, 46, + 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, + 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, + 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, + 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60, 63, + 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65, + 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, + 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, + 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, + 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, + 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89, + 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92, + 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98, + 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, + 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108, + 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, 117, + 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, 119, + 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 
90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, + 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134, + }, { + 32, + 31, 31, + 31, 31, 31, + 30, 31, 31, 31, + 30, 31, 31, 31, 32, + 32, 32, 33, 33, 33, 35, + 33, 34, 34, 35, 35, 37, 39, + 34, 35, 35, 36, 36, 38, 40, 41, + 36, 38, 38, 39, 40, 41, 43, 44, 47, + 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, + 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, + 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, + 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, 52, + 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, + 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, + 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, 58, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59, + 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, + 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, + 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, + 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, + 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, + 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71, + 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, + 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, + 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, + 60, 57, 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, 72, 74, 75, 77, 
79, 80, 82, + 60, 57, 57, 55, 54, 54, 54, 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, 79, 80, 82, 82, + 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, + 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + }, + }, { + { + 32, + 31, 31, + 31, 31, 32, + 31, 32, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, + 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, + 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39, + 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, + 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, + 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53, 54, + 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, + 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, + 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, + 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, + 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71, + 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, + 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, + 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, + 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, + 59, 57, 56, 55, 54, 54, 
54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, + 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105, + 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, 109, 114, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 30, 31, 31, 31, 32, + 30, 31, 31, 31, 32, 32, + 33, 33, 34, 34, 34, 34, 37, + 33, 34, 34, 35, 35, 35, 38, 39, + 34, 36, 36, 36, 37, 37, 40, 40, 42, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, + 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, + 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48, + 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, + 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, + 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 54, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, + 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, + 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61, + 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, + 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, + 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 
49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, + 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, + 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, + 55, 53, 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, + 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, + 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76, + 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, + }, + }, { + { + 32, + 31, 31, + 31, 31, 32, + 31, 31, 32, 32, + 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, + 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, + 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, + 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51, + 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, + 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, + 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, + 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63, + 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 
48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, + 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, + 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, + 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, + 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, + 54, 52, 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, + 55, 53, 53, 52, 51, 50, 50, 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, 78, 83, 83, 85, + 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, + 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 30, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 32, + 31, 31, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 35, 35, 35, 38, + 33, 34, 34, 34, 35, 35, 36, 38, 39, + 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, + 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, + 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, + 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, + 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53, + 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, + 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, + 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, + 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 
49, 49, 51, 53, 53, 54, 55, 55, 57, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58, + 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, + 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, + 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, + 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, + 52, 50, 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, + 53, 51, 50, 50, 48, 48, 48, 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, 64, 67, 67, 68, + 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, + 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, + }, + }, { + { + 32, + 31, 31, + 31, 31, 32, + 31, 31, 32, 32, + 31, 31, 32, 32, 32, + 31, 31, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, + 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, + 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, + 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, 41, + 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46, + 36, 35, 35, 35, 35, 34, 34, 
35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, + 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, + 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, + 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, + 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57, + 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, + 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, + 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, + 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, + 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, + 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71, + 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 32, + 30, 31, 31, 31, 31, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 34, 34, 35, 37, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, + 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, + 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, + 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 
48, + 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, 49, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, + 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, + 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, + 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, + 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, + 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, + 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55, + 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, + 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, + 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61, + 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, + }, + }, { + { + 32, + 31, 31, + 31, 31, 32, + 31, 31, 32, 32, + 31, 31, 32, 32, 32, + 31, 31, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, + 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, + 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, + 41, 41, 40, 40, 40, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, 56, 56, 56, 58, + 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, + 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 31, 32, + 30, 31, 31, 31, 31, 31, 32, 32, + 30, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, + 33, 34, 34, 34, 34, 35, 
35, 35, 35, 37, 39, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, + 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, + 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, + 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, + 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, + 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, + 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, + 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, + 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, + 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 56, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, + 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, + }, + }, { + { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 32, + 31, 31, 31, 32, 32, + 
31, 31, 31, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, + 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, + 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, + 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, + 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48, + 37, 37, 36, 36, 36, 36, 
36, 35, 35, 35, 35, 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 31, 31, 31, 31, 31, 31, 32, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42, + 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, + 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, + 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, + 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, + 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, + 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 
50, 50, 50, 50, 51, 52, 53, 53, + 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53, + 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + }, + }, { + { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 32, 32, + 31, 31, 31, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 32, 32, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, + 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, + 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44, + 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, + 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, + 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, + 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 
40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, + 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, + 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + }, + }, { + { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, + 31, 31, 31, 31, 31, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 34, 34, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + }, + }, { + { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + }, { + 32, + 31, 31, + 31, 31, 31, + 31, 31, 31, 31, + 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 
/*
 * Build a sz x sz matrix in dst by sampling the row-major matrix in src.
 *
 * Output element (row, col) is read from
 *     src[row * sz * step * step + col * step]
 * i.e. every step-th column is taken, and consecutive output rows are
 * sz * step * step source elements apart. With step == 1 this is a plain
 * copy of a sz x sz matrix.
 *
 * dst must have room for sz * sz bytes; src must be readable at every
 * index produced by the formula above.
 */
static void subsample(uint8_t *const dst, const uint8_t *const src,
                      const int sz, const int step)
{
    /* distance, in source elements, between two consecutive sampled rows */
    const int src_row_stride = sz * step * step;
    uint8_t *out = dst;

    for (int row = 0; row < sz; row++) {
        const uint8_t *const in = &src[row * src_row_stride];

        for (int col = 0; col < sz; col++)
            *out++ = in[col * step];
    }
}
/*
 * Transpose a row-major matrix.
 *
 * src holds h rows of w bytes each; dst receives w rows of h bytes each,
 * such that dst[x * h + y] == src[y * w + x]. The two buffers are expected
 * to be distinct.
 */
static void transpose(uint8_t *const dst, const uint8_t *const src,
                      const int w, const int h)
{
    for (int row = 0; row < h; row++) {
        const uint8_t *const in = &src[row * w];

        for (int col = 0; col < w; col++)
            dst[col * h + row] = in[col];
    }
}

/*
 * Expand a packed lower triangle into a full symmetric sz x sz matrix.
 *
 * src stores row 0 (1 byte), then row 1 (2 bytes), ..., then row sz - 1
 * (sz bytes), i.e. packed row r starts at offset r * (r + 1) / 2.
 * In dst, entries on or below the diagonal are copied straight from the
 * packed row; an entry above the diagonal at (row, col) is taken from the
 * mirrored packed entry (col, row).
 */
static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) {
    for (int row = 0; row < sz; row++) {
        uint8_t *const out = &dst[row * sz];
        const uint8_t *const packed_row = &src[row * (row + 1) / 2];

        /* columns 0..row are stored directly in the packed row */
        memcpy(out, packed_row, row + 1);

        /* columns right of the diagonal come from the mirrored entry */
        for (int col = row + 1; col < sz; col++)
            out[col] = src[col * (col + 1) / 2 + row];
    }
}
qm_tbl_4x4_t[i][j], 4); + untriangle(qm_tbl_8x8[i][j], qm_tbl_8x8_t[i][j], 8); + untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32); + subsample(qm_tbl_16x16[i][j], qm_tbl_32x32[i][j], 16, 2); + + dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32]; + dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32]; + dav1d_qm_tbl[i][j][RTX_64X16] = dav1d_qm_tbl[i][j][RTX_32X16]; + dav1d_qm_tbl[i][j][RTX_32X64] = dav1d_qm_tbl[i][j][ TX_32X32]; + dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32]; + } + + memset(pb_32x32, 32, sizeof(pb_32x32)); + for (int j = 0; j < 2; j++) + for (int k = 0; k < N_RECT_TX_SIZES; k++) + dav1d_qm_tbl[15][j][k] = pb_32x32; +} diff --git a/third_party/dav1d/src/qm.h b/third_party/dav1d/src/qm.h new file mode 100644 index 0000000000..23b2348a70 --- /dev/null +++ b/third_party/dav1d/src/qm.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
#ifndef DAV1D_SRC_QM_H
#define DAV1D_SRC_QM_H

#include "src/levels.h"

/*
 * Table of pointers to quantizer matrices, indexed as
 * [outer index 0..15][0..1][transform size, 0..N_RECT_TX_SIZES-1].
 * The pointers are not valid until dav1d_init_qm_tables() has run; that
 * function points every slot of outer index 15 at a buffer filled with
 * the constant 32.
 * NOTE(review): the second index presumably distinguishes luma from
 * chroma - confirm against the callers.
 */
extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];

/*
 * Fills dav1d_qm_tbl and the static matrices it points to. The definition
 * carries a comment stating that it is guaranteed to be called only once.
 */
void dav1d_init_qm_tables(void);

#endif /* DAV1D_SRC_QM_H */
#ifndef DAV1D_SRC_RECON_H
#define DAV1D_SRC_RECON_H

#include "src/internal.h"
#include "src/levels.h"

/*
 * Debug switches. DEBUG_BLOCK_INFO begins with `0 &&`, so as written it
 * always evaluates to false; the remaining terms refer to variables named
 * `f` and `t`, which therefore have to be in scope wherever the macro is
 * expanded. Changing the leading 0 enables the frame/block position filter
 * that follows it.
 */
#define DEBUG_BLOCK_INFO 0 && \
        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
        t->bx >= 8 && t->bx < 12
#define DEBUG_B_PIXELS 0

/*
 * Each decl_*_fn(name) macro expands to a function declaration with a fixed
 * signature. The typedef that follows each macro passes a pointer
 * declarator (*type_name) as `name`, which yields the matching
 * function-pointer type.
 */
#define decl_recon_b_intra_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, \
            enum EdgeFlags intra_edge_flags, const Av1Block *b)
typedef decl_recon_b_intra_fn(*recon_b_intra_fn);

/* Unlike the other entry points declared here, this one returns int. */
#define decl_recon_b_inter_fn(name) \
int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_recon_b_inter_fn(*recon_b_inter_fn);

#define decl_filter_sbrow_fn(name) \
void (name)(Dav1dFrameContext *f, int sby)
typedef decl_filter_sbrow_fn(*filter_sbrow_fn);

#define decl_backup_ipred_edge_fn(name) \
void (name)(Dav1dTileContext *t)
typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);

#define decl_read_coef_blocks_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);

/*
 * Every entry point is declared twice, with an _8bpc and a _16bpc suffix.
 * NOTE(review): presumably one instantiation of recon_tmpl.c per bit
 * depth - confirm against the build setup.
 */
decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);

decl_recon_b_inter_fn(dav1d_recon_b_inter_8bpc);
decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);

decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);

decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);

decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);

#endif /* DAV1D_SRC_RECON_H */
+ */ + +#include "config.h" + +#include +#include + +#include "common/attributes.h" +#include "common/bitdepth.h" +#include "common/dump.h" +#include "common/intops.h" + +#include "src/cdef_apply.h" +#include "src/ctx.h" +#include "src/ipred_prepare.h" +#include "src/lf_apply.h" +#include "src/lr_apply.h" +#include "src/recon.h" +#include "src/scan.h" +#include "src/tables.h" +#include "src/wedge.h" + +static inline unsigned read_golomb(MsacContext *const msac) { + int len = 0; + unsigned val = 1; + + while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++; + while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac); + + return val - 1; +} + +static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim, + const enum BlockSize bs, + const uint8_t *const a, + const uint8_t *const l, + const int chroma, + const enum Dav1dPixelLayout layout) +{ + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + + if (chroma) { + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw || + b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh; + unsigned ca, cl; + +#define MERGE_CTX(dir, type, no_val) \ + c##dir = *(const type *) dir != no_val; \ + break + + switch (t_dim->lw) { + /* For some reason the MSVC CRT _wassert() function is not flagged as + * __declspec(noreturn), so when using those headers the compiler will + * expect execution to continue after an assertion has been triggered + * and will therefore complain about the use of uninitialized variables + * when compiled in debug mode if we put the default case at the end. 
/*
 * Compute a context index from the block size, the transform dimensions
 * and two per-direction context byte arrays.
 *
 * NOTE(review): `a` and `l` are presumably the above and left neighbour
 * context arrays, and the result presumably selects the CDF for the
 * "all coefficients are zero" flag - confirm against the callers.
 *
 * Three cases:
 *  - chroma != 0: returns 7 + 3 * not_one_blk + ca + cl, where ca / cl are
 *    1 when the context bytes spanned by the transform width / height are
 *    not all equal to 0x40.
 *  - chroma == 0 and the block dimensions (b_dim[2], b_dim[3]) equal the
 *    transform dimensions (lw, lh): returns 0.
 *  - otherwise: the context bytes spanned by the transform in each
 *    direction are OR-ed together, masked with 0x3F, clamped to at most 4
 *    and used as the two indices into dav1d_skip_ctx.
 *
 * NOTE(review): the context arrays are read through uint16_t / uint32_t /
 * uint64_t pointer casts, so they must be readable for that many bytes
 * (16 for TX_64X64); alignment and aliasing assumptions are not visible in
 * this function.
 */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        /* set when the block dimension, reduced by one if it is non-zero and
         * that direction is subsampled, exceeds the transform dimension in
         * either direction */
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        unsigned ca, cl;

/* Chroma variant: c<dir> = 1 if the bytes of <dir> loaded as one `type`
 * value differ from no_val (0x40 repeated in every byte). The macro ends in
 * `break`, so each case label below is a complete switch arm. */
#define MERGE_CTX(dir, type, no_val) \
        c##dir = *(const type *) dir != no_val; \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        return 0;
    } else {
        unsigned la, ll;

/* Non-chroma variant: l<dir> gets the bitwise OR of all context bytes
 * covered by the transform in that direction, folded down so that the low
 * byte holds the OR of every loaded byte:
 *  - TX_64X64 loads 16 bytes as two uint64_t and ORs the halves together;
 *  - TX_32X32 loads a second `type`-sized word right after the first;
 *  - the >> 16 and >> 8 steps fold the upper bytes into the low byte.
 * Like the chroma variant, the macro ends in `break`. */
#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        /* only the low 6 bits of each merged value are used, and each index
         * is capped at 4 */
        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}
0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL; + int s; + +#if ARCH_X86_64 && defined(__GNUC__) + /* Coerce compilers into producing better code. For some reason + * every x86-64 compiler is awful at handling 64-bit constants. */ + __asm__("" : "+r"(mask), "+r"(mul)); +#endif + + switch(tx) { + default: assert(0); /* fall-through */ + case TX_4X4: { + int t = *(const uint8_t *) a >> 6; + t += *(const uint8_t *) l >> 6; + s = t - 1 - 1; + break; + } + case TX_8X8: { + uint32_t t = *(const uint16_t *) a & (uint32_t) mask; + t += *(const uint16_t *) l & (uint32_t) mask; + t *= 0x04040404U; + s = (int) (t >> 24) - 2 - 2; + break; + } + case TX_16X16: { + uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6; + t += (*(const uint32_t *) l & (uint32_t) mask) >> 6; + t *= (uint32_t) mul; + s = (int) (t >> 24) - 4 - 4; + break; + } + case TX_32X32: { + uint64_t t = (*(const uint64_t *) a & mask) >> 6; + t += (*(const uint64_t *) l & mask) >> 6; + t *= mul; + s = (int) (t >> 56) - 8 - 8; + break; + } + case TX_64X64: { + uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; + t += (*(const uint64_t *) &a[8] & mask) >> 6; + t += (*(const uint64_t *) &l[0] & mask) >> 6; + t += (*(const uint64_t *) &l[8] & mask) >> 6; + t *= mul; + s = (int) (t >> 56) - 16 - 16; + break; + } + case RTX_4X8: { + uint32_t t = *(const uint8_t *) a & (uint32_t) mask; + t += *(const uint16_t *) l & (uint32_t) mask; + t *= 0x04040404U; + s = (int) (t >> 24) - 1 - 2; + break; + } + case RTX_8X4: { + uint32_t t = *(const uint16_t *) a & (uint32_t) mask; + t += *(const uint8_t *) l & (uint32_t) mask; + t *= 0x04040404U; + s = (int) (t >> 24) - 2 - 1; + break; + } + case RTX_8X16: { + uint32_t t = *(const uint16_t *) a & (uint32_t) mask; + t += *(const uint32_t *) l & (uint32_t) mask; + t = (t >> 6) * (uint32_t) mul; + s = (int) (t >> 24) - 2 - 4; + break; + } + case RTX_16X8: { + uint32_t t = *(const uint32_t *) a & (uint32_t) mask; + t += *(const uint16_t *) l & (uint32_t) mask; + t = (t >> 
6) * (uint32_t) mul; + s = (int) (t >> 24) - 4 - 2; + break; + } + case RTX_16X32: { + uint64_t t = *(const uint32_t *) a & (uint32_t) mask; + t += *(const uint64_t *) l & mask; + t = (t >> 6) * mul; + s = (int) (t >> 56) - 4 - 8; + break; + } + case RTX_32X16: { + uint64_t t = *(const uint64_t *) a & mask; + t += *(const uint32_t *) l & (uint32_t) mask; + t = (t >> 6) * mul; + s = (int) (t >> 56) - 8 - 4; + break; + } + case RTX_32X64: { + uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; + t += (*(const uint64_t *) &l[0] & mask) >> 6; + t += (*(const uint64_t *) &l[8] & mask) >> 6; + t *= mul; + s = (int) (t >> 56) - 8 - 16; + break; + } + case RTX_64X32: { + uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; + t += (*(const uint64_t *) &a[8] & mask) >> 6; + t += (*(const uint64_t *) &l[0] & mask) >> 6; + t *= mul; + s = (int) (t >> 56) - 16 - 8; + break; + } + case RTX_4X16: { + uint32_t t = *(const uint8_t *) a & (uint32_t) mask; + t += *(const uint32_t *) l & (uint32_t) mask; + t = (t >> 6) * (uint32_t) mul; + s = (int) (t >> 24) - 1 - 4; + break; + } + case RTX_16X4: { + uint32_t t = *(const uint32_t *) a & (uint32_t) mask; + t += *(const uint8_t *) l & (uint32_t) mask; + t = (t >> 6) * (uint32_t) mul; + s = (int) (t >> 24) - 4 - 1; + break; + } + case RTX_8X32: { + uint64_t t = *(const uint16_t *) a & (uint32_t) mask; + t += *(const uint64_t *) l & mask; + t = (t >> 6) * mul; + s = (int) (t >> 56) - 2 - 8; + break; + } + case RTX_32X8: { + uint64_t t = *(const uint64_t *) a & mask; + t += *(const uint16_t *) l & (uint32_t) mask; + t = (t >> 6) * mul; + s = (int) (t >> 56) - 8 - 2; + break; + } + case RTX_16X64: { + uint64_t t = *(const uint32_t *) a & (uint32_t) mask; + t += *(const uint64_t *) &l[0] & mask; + t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6); + t *= mul; + s = (int) (t >> 56) - 4 - 16; + break; + } + case RTX_64X16: { + uint64_t t = *(const uint64_t *) &a[0] & mask; + t += *(const uint32_t *) l & (uint32_t) mask; + t = (t >> 
6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 4;
+        break;
+    }
+    }
+
+    return (s != 0) + (s > 0);
+}
+
+/* Computes the CDF context for the base ("lo") token of one coefficient
+ * from the levels already stored for its neighbours.
+ *
+ * levels:      pointer to this coefficient's own entry in the level map;
+ *              neighbours are read at positive offsets only.
+ * tx_class:    TX_CLASS_2D uses a 2-D neighbourhood (offsets +1, +2,
+ *              +stride, +stride+1, +2*stride); any other class reads the
+ *              four entries at offsets +1..+4 plus the one at +stride.
+ * hi_mag:      out; receives the sum of the first three neighbours, which
+ *              the callers reuse for the high-token context.
+ * ctx_offsets: 5x5 offset table, only dereferenced for TX_CLASS_2D.
+ * x, y:        coefficient position; 2-D clamps both to 4 for the table
+ *              lookup, 1-D derives the offset from y alone.
+ * stride:      distance between consecutive rows of the level map.
+ *
+ * Returns offset + min(4, (mag + 64) >> 7), mag being the sum of all five
+ * neighbour bytes.
+ * NOTE(review): the >> 7 scaling presumably relies on the packing noted in
+ * decode_coefs() ("bits 0-5: tok, 6-7: lo_tok") -- confirm. */
+static inline unsigned get_lo_ctx(const uint8_t *const levels,
+                                  const enum TxClass tx_class,
+                                  unsigned *const hi_mag,
+                                  const uint8_t (*const ctx_offsets)[5],
+                                  const unsigned x, const unsigned y,
+                                  const ptrdiff_t stride)
+{
+    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
+    unsigned offset;
+    if (tx_class == TX_CLASS_2D) {
+        mag += levels[1 * stride + 1];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
+        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
+    } else {
+        mag += levels[0 * stride + 2];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
+        offset = 26 + (y > 1 ? 10 : y * 5);
+    }
+    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
+}
+
+static int decode_coefs(Dav1dTileContext *const t,
+                        uint8_t *const a, uint8_t *const l,
+                        const enum RectTxfmSize tx, const enum BlockSize bs,
+                        const Av1Block *const b, const int intra,
+                        const int plane, coef *cf,
+                        enum TxfmType *const txtp, uint8_t *res_ctx)
+{
+    Dav1dTileState *const ts = t->ts;
+    const int chroma = !!plane;
+    const Dav1dFrameContext *const f = t->f;
+    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
+
+    if (dbg)
+        printf("Start: r=%d\n", ts->msac.rng);
+
+    // does this block have any non-zero coefficients
+    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
+    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
+                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
+    if (dbg)
+        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+               t_dim->ctx, sctx, all_skip, ts->msac.rng);
+    if (all_skip) {
+        *res_ctx = 0x40;
+        *txtp = lossless * WHT_WHT; /* lossless ?
WHT_WHT : DCT_DCT */ + return -1; + } + + // transform type (chroma: derived, luma: explicitly coded) + if (lossless) { + assert(t_dim->max == TX_4X4); + *txtp = WHT_WHT; + } else if (t_dim->max + intra >= TX_64X64) { + *txtp = DCT_DCT; + } else if (chroma) { + // inferred from either the luma txtp (inter) or a LUT (intra) + *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] : + get_uv_inter_txtp(t_dim, *txtp); + } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) { + // In libaom, lossless is checked by a literal qidx == 0, but not all + // such blocks are actually lossless. The remainder gets an implicit + // transform type (for luma) + *txtp = DCT_DCT; + } else { + unsigned idx; + if (intra) { + const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? + dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode; + if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) { + idx = dav1d_msac_decode_symbol_adapt4(&ts->msac, + ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4); + *txtp = dav1d_tx_types_per_set[idx + 0]; + } else { + idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6); + *txtp = dav1d_tx_types_per_set[idx + 5]; + } + if (dbg) + printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n", + tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng); + } else { + if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) { + idx = dav1d_msac_decode_bool_adapt(&ts->msac, + ts->cdf.m.txtp_inter3[t_dim->min]); + *txtp = (idx - 1) & IDTX; /* idx ? 
DCT_DCT : IDTX */ + } else if (t_dim->min == TX_16X16) { + idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.txtp_inter2, 11); + *txtp = dav1d_tx_types_per_set[idx + 12]; + } else { + idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, + ts->cdf.m.txtp_inter1[t_dim->min], 15); + *txtp = dav1d_tx_types_per_set[idx + 24]; + } + if (dbg) + printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n", + tx, t_dim->min, idx, *txtp, ts->msac.rng); + } + } + + // find end-of-block (eob) + int eob_bin; + const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32); + const enum TxClass tx_class = dav1d_tx_type_class[*txtp]; + const int is_1d = tx_class != TX_CLASS_2D; + switch (tx2dszctx) { +#define case_sz(sz, bin, ns, is_1d) \ + case sz: { \ + uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \ + eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \ + break; \ + } + case_sz(0, 16, 4, [is_1d]); + case_sz(1, 32, 8, [is_1d]); + case_sz(2, 64, 8, [is_1d]); + case_sz(3, 128, 8, [is_1d]); + case_sz(4, 256, 16, [is_1d]); + case_sz(5, 512, 16, ); + case_sz(6, 1024, 16, ); +#undef case_sz + } + if (dbg) + printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n", + 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng); + int eob; + if (eob_bin > 1) { + uint16_t *const eob_hi_bit_cdf = + ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin]; + const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf); + if (dbg) + printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n", + t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng); + eob = ((eob_hi_bit | 2) << (eob_bin - 2)) | + dav1d_msac_decode_bools(&ts->msac, eob_bin - 2); + if (dbg) + printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng); + } else { + eob = eob_bin; + } + + // base tokens + uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma]; + uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma]; + const uint16_t *const 
scan = dav1d_scans[tx][tx_class]; + int dc_tok; + + if (eob) { + uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma]; + uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok + const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8); + const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1; + + /* eob */ + unsigned rc = scan[eob], x = rc >> shift, y = rc & mask; + unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4); + int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2); + int tok = eob_tok + 1; + int level_tok = tok * 0x41; + unsigned mag; + if (dbg) + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", + t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); + +#define DECODE_COEFS_CLASS(tx_class) \ + if (eob_tok == 2) { \ + ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \ + tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \ + tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ + level_tok = tok + (3 << 6); \ + if (dbg) \ + printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ + imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \ + ts->msac.rng); \ + } \ + cf[rc] = tok; \ + if (tx_class == TX_CLASS_H) \ + /* Transposing reduces the stride and padding requirements */ \ + levels[y * stride + x] = (uint8_t) level_tok; \ + else \ + levels[x * stride + y] = (uint8_t) level_tok; \ + for (int i = eob - 1; i > 0; i--) { /* ac */ \ + if (tx_class == TX_CLASS_H) \ + rc = i, x = rc & mask, y = rc >> shift; \ + else \ + rc = scan[i], x = rc >> shift, y = rc & mask; \ + assert(x < 32 && y < 32); \ + uint8_t *const level = levels + x * stride + y; \ + ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \ + if (tx_class == TX_CLASS_2D) \ + y |= x; \ + tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \ + level_tok = tok * 0x41; \ + if (dbg) \ + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ + t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \ + if 
(tok == 3) { \ + mag &= 63; \ + ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \ + (mag > 12 ? 6 : (mag + 1) >> 1); \ + tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ + level_tok = tok + (3 << 6); \ + if (dbg) \ + printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ + imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \ + ts->msac.rng); \ + } \ + cf[rc] = tok; \ + *level = (uint8_t) level_tok; \ + } \ + /* dc */ \ + ctx = (tx_class == TX_CLASS_2D) ? 0 : \ + get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \ + dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \ + if (dbg) \ + printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \ + t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \ + if (dc_tok == 3) { \ + if (tx_class == TX_CLASS_2D) \ + mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \ + levels[1 * stride + 1]; \ + mag &= 63; \ + ctx = mag > 12 ? 6 : (mag + 1) >> 1; \ + dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ + if (dbg) \ + printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \ + imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \ + } \ + break + + switch (tx_class) { + case TX_CLASS_2D: { + const unsigned nonsquare_tx = tx >= RTX_4X8; + const uint8_t (*const lo_ctx_offsets)[5] = + dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)]; + const ptrdiff_t stride = 4 * sh; + memset(levels, 0, stride * (4 * sw + 2)); + DECODE_COEFS_CLASS(TX_CLASS_2D); + } + case TX_CLASS_H: { +#define lo_ctx_offsets NULL + const ptrdiff_t stride = 16; + memset(levels, 0, stride * (4 * sh + 2)); + DECODE_COEFS_CLASS(TX_CLASS_H); + } + case TX_CLASS_V: { + const ptrdiff_t stride = 16; + memset(levels, 0, stride * (4 * sw + 2)); + DECODE_COEFS_CLASS(TX_CLASS_V); + } +#undef lo_ctx_offsets +#undef DECODE_COEFS_CLASS + default: assert(0); + } + } else { // dc-only + int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2); + dc_tok = 1 + tok_br; + if (dbg) + printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", + 
t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng); + if (tok_br == 2) { + dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]); + if (dbg) + printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); + } + } + + // residual and sign + int dc_sign = 1 << 6; + const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; + const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane]; + const int dq_shift = imax(0, t_dim->ctx - 2); + const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc; + const int cf_max = (1 << (7 + bitdepth)) - 1; + unsigned cul_level = 0; + + if (dc_tok) { // dc + const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); + uint16_t *const dc_sign_cdf = + ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; + const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); + const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; + if (dbg) + printf("Post-dc_sign[%d][%d][%d]: r=%d\n", + chroma, dc_sign_ctx, sign, ts->msac.rng); + dc_sign = (sign - 1) & (2 << 6); + + if (dc_tok == 15) { + dc_tok += read_golomb(&ts->msac); + if (dbg) + printf("Post-dc_residual[%d->%d]: r=%d\n", + dc_tok - 15, dc_tok, ts->msac.rng); + + dc_tok &= 0xfffff; + } + + cul_level += dc_tok; + dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift; + cf[0] = imin(dc_tok - sign, cf_max) ^ -sign; + } + for (int i = 1; i <= eob; i++) { // ac + const int rc = scan[i]; + int tok = cf[rc]; + if (!tok) continue; + + // sign + const int sign = dav1d_msac_decode_bool_equi(&ts->msac); + const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; + if (dbg) + printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng); + + // residual + if (tok == 15) { + tok += read_golomb(&ts->msac); + if (dbg) + printf("Post-residual[%d=%d=%d->%d]: r=%d\n", + i, rc, tok - 15, tok, ts->msac.rng); + + // coefficient parsing, see 5.11.39 + tok &= 0xfffff; + } + + // dequant, see 7.12.3 + cul_level += tok; + tok = ((dq * tok) & 0xffffff) >> dq_shift; + cf[rc] = 
imin(tok - sign, cf_max) ^ -sign;
+    }
+
+    // context
+    *res_ctx = umin(cul_level, 63) | dc_sign;
+
+    return eob;
+}
+
+/* Walks the luma transform split tree of an inter block and, for every leaf
+ * transform, decodes its coefficients and/or reconstructs it.
+ *
+ * ytx:      transform size at this tree node.
+ * depth:    current split depth; splitting is only considered below depth 2.
+ * tx_split: per-depth bitmasks; bit (y_off * 4 + x_off) set means "split".
+ * x_off,
+ * y_off:    position of this node at the current depth.
+ * dst:      destination pixels for this node; may be NULL when the caller
+ *           only parses (dav1d_read_coef_blocks() passes NULL), but must be
+ *           non-NULL whenever reconstruction runs (asserted below).
+ *
+ * t->bx / t->by are advanced while visiting children and restored before
+ * returning. What a leaf does depends on f->frame_thread.pass:
+ *   - pass != 2: decode coefficients, update the above/left luma contexts
+ *     and t->txtp_map; in pass 1 also store eob/txtp in the cbi entry.
+ *   - pass == 2: reload eob/txtp from the cbi entry instead of decoding.
+ *   - pass 0 or 2: add the inverse transform to dst when eob >= 0. */
+static void read_coef_tree(Dav1dTileContext *const t,
+                           const enum BlockSize bs, const Av1Block *const b,
+                           const enum RectTxfmSize ytx, const int depth,
+                           const uint16_t *const tx_split,
+                           const int x_off, const int y_off, pixel *dst)
+{
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+    const int txw = t_dim->w, txh = t_dim->h;
+
+    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
+     * be split. Testing tx_split[depth] for non-zero first avoids an
+     * undefined left shift in that case. */
+    if (depth < 2 && tx_split[depth] &&
+        tx_split[depth] & (1 << (y_off * 4 + x_off)))
+    {
+        const enum RectTxfmSize sub = t_dim->sub;
+        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+        /* top-left child is always visited; the right-hand children only if
+         * the transform is at least as wide as tall and still inside the
+         * frame, the bottom children likewise for the vertical direction */
+        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                       x_off * 2 + 0, y_off * 2 + 0, dst);
+        t->bx += txsw;
+        if (txw >= txh && t->bx < f->bw)
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+        t->bx -= txsw;
+        t->by += txsh;
+        if (txh >= txw && t->by < f->bh) {
+            if (dst)
+                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                           x_off * 2 + 0, y_off * 2 + 1, dst);
+            t->bx += txsw;
+            if (txw >= txh && t->bx < f->bw)
+                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+            t->bx -= txsw;
+        }
+        t->by -= txsh;
+    } else {
+        const int bx4 = t->bx & 31, by4 = t->by & 31;
+        enum TxfmType txtp;
+        uint8_t cf_ctx;
+        int eob;
+        coef *cf;
+        struct CodedBlockInfo *cbi; /* only assigned when frame_thread.pass != 0 */
+
+        if (f->frame_thread.pass) {
+            assert(ts->frame_thread.cf);
+            cf = ts->frame_thread.cf;
+            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+        } else {
+            cf = bitfn(t->cf);
+        }
+        if (f->frame_thread.pass != 2) {
+            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                       ytx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+            memset(&t->dir lcoef[off], cf_ctx, sz)
+            case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+            case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            for (int y = 0; y < txh; y++) { \
+                rep_macro(type, txtp_map, 0, mul * txtp); \
+                txtp_map += 32; \
+            }
+            uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+            case_set_upto16(txw,,,);
+#undef set_ctx
+            if (f->frame_thread.pass == 1) {
+                cbi->eob[0] = eob;
+                cbi->txtp[0] = txtp;
+            }
+        } else {
+            eob = cbi->eob[0];
+            txtp = cbi->txtp[0];
+        }
+        if (!(f->frame_thread.pass & 1)) {
+            assert(dst);
+            if (eob >= 0) {
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
+                                              HIGHBD_CALL_SUFFIX);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+            }
+        }
+    }
+}
+
+void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
+                                    const enum BlockSize bs, const
Av1Block *const b) +{ + const Dav1dFrameContext *const f = t->f; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int bx4 = t->bx & 31, by4 = t->by & 31; + const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = b_dim[0], bh4 = b_dim[1]; + const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && + (bw4 > ss_hor || t->bx & 1) && + (bh4 > ss_ver || t->by & 1); + + if (b->skip) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir lcoef, off, mul * 0x40) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \ + rep_macro(type, t->dir ccoef[1], off, mul * 0x40) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + return; + } + + Dav1dTileState *const ts = t->ts; + const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); + const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; + assert(f->frame_thread.pass == 1); + assert(!b->skip); + const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? 
b->tx : b->max_ytx]; + const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; + + for (int init_y = 0; init_y < h4; init_y += 16) { + const int sub_h4 = imin(h4, 16 + init_y); + for (int init_x = 0; init_x < w4; init_x += 16) { + const int sub_w4 = imin(w4, init_x + 16); + int y_off = !!init_y, y, x; + for (y = init_y, t->by += init_y; y < sub_h4; + y += t_dim->h, t->by += t_dim->h, y_off++) + { + struct CodedBlockInfo *const cbi = + &f->frame_thread.cbi[t->by * f->b4_stride]; + int x_off = !!init_x; + for (x = init_x, t->bx += init_x; x < sub_w4; + x += t_dim->w, t->bx += t_dim->w, x_off++) + { + if (!b->intra) { + read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, + x_off, y_off, NULL); + } else { + uint8_t cf_ctx = 0x40; + enum TxfmType txtp; + const int eob = cbi[t->bx].eob[0] = + decode_coefs(t, &t->a->lcoef[bx4 + x], + &t->l.lcoef[by4 + y], b->tx, bs, b, 1, + 0, ts->frame_thread.cf, &txtp, &cf_ctx); + if (DEBUG_BLOCK_INFO) + printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", + b->tx, txtp, eob, ts->msac.rng); + cbi[t->bx].txtp[0] = txtp; + ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir lcoef, off, mul * cf_ctx) +#define default_memset(dir, diridx, off, sz) \ + memset(&t->dir lcoef[off], cf_ctx, sz) + case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), + l., 1, by4 + y); + case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), + a->, 0, bx4 + x); +#undef default_memset +#undef set_ctx + } + } + t->bx -= x; + } + t->by -= y; + + if (!has_chroma) continue; + + const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); + const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); + for (int pl = 0; pl < 2; pl++) { + for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; + y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) + { + struct CodedBlockInfo *const cbi = + &f->frame_thread.cbi[t->by * f->b4_stride]; + for (x = init_x >> ss_hor, t->bx += 
init_x; x < sub_cw4; + x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) + { + uint8_t cf_ctx = 0x40; + enum TxfmType txtp; + if (!b->intra) + txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 + + bx4 + (x << ss_hor)]; + const int eob = cbi[t->bx].eob[1 + pl] = + decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], + &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, + b, b->intra, 1 + pl, ts->frame_thread.cf, + &txtp, &cf_ctx); + if (DEBUG_BLOCK_INFO) + printf("Post-uv-cf-blk[pl=%d,tx=%d," + "txtp=%d,eob=%d]: r=%d\n", + pl, b->uvtx, txtp, eob, ts->msac.rng); + cbi[t->bx].txtp[1 + pl] = txtp; + ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16; +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) +#define default_memset(dir, diridx, off, sz) \ + memset(&t->dir ccoef[pl][off], cf_ctx, sz) + case_set_upto16_with_default( \ + imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver), + l., 1, cby4 + y); + case_set_upto16_with_default( \ + imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor), + a->, 0, cbx4 + x); +#undef default_memset +#undef set_ctx + } + t->bx -= x << ss_hor; + } + t->by -= y << ss_ver; + } + } + } +} + +static int mc(Dav1dTileContext *const t, + pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride, + const int bw4, const int bh4, + const int bx, const int by, const int pl, + const mv mv, const Dav1dThreadPicture *const refp, const int refidx, + const enum Filter2d filter_2d) +{ + assert((dst8 != NULL) ^ (dst16 != NULL)); + const Dav1dFrameContext *const f = t->f; + const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + const int mvx = mv.x, mvy = mv.y; + const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver); + ptrdiff_t ref_stride = refp->p.stride[!!pl]; + const pixel *ref; + + if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) { + 
const int dx = bx * h_mul + (mvx >> (3 + ss_hor)); + const int dy = by * v_mul + (mvy >> (3 + ss_ver)); + int w, h; + + if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc + if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4, + PLANE_TYPE_Y + !!pl)) + { + return -1; + } + w = (f->cur.p.w + ss_hor) >> ss_hor; + h = (f->cur.p.h + ss_ver) >> ss_ver; + } else { + w = f->bw * 4 >> ss_hor; + h = f->bh * 4 >> ss_ver; + } + if (dx < !!mx * 3 || dy < !!my * 3 || + dx + bw4 * h_mul + !!mx * 4 > w || + dy + bh4 * v_mul + !!my * 4 > h) + { + pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); + f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, + w, h, dx - !!mx * 3, dy - !!my * 3, + emu_edge_buf, 192 * sizeof(pixel), + refp->p.data[pl], ref_stride); + ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3]; + ref_stride = 192 * sizeof(pixel); + } else { + ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; + } + + if (dst8 != NULL) { + f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul, + bh4 * v_mul, mx << !ss_hor, my << !ss_ver + HIGHBD_CALL_SUFFIX); + } else { + f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul, + bh4 * v_mul, mx << !ss_hor, my << !ss_ver + HIGHBD_CALL_SUFFIX); + } + } else { + assert(refp != &f->sr_cur); + + const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); + const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); +#define scale_mv(res, val, scale) do { \ + const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \ + res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \ + } while (0) + int pos_y, pos_x; + scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale); + scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale); +#undef scale_mv + const int left = pos_x >> 10; + const int top = pos_y >> 10; + const int right = + ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1; + const int bottom = + ((pos_y + (bh4 * v_mul - 
1) * f->svc[refidx][1].step) >> 10) + 1;
+
+        if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl))
+            return -1;
+        if (DEBUG_BLOCK_INFO)
+            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
+                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
+                   right-left, bottom-top,
+                   f->svc[refidx][0].step, f->svc[refidx][1].step);
+
+        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
+        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
+        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
+            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
+                                w, h, left - 3, top - 3,
+                                emu_edge_buf, 320 * sizeof(pixel),
+                                refp->p.data[pl], ref_stride);
+            ref = &emu_edge_buf[320 * 3 + 3];
+            ref_stride = 320 * sizeof(pixel);
+            if (DEBUG_BLOCK_INFO) printf("Emu\n");
+        } else {
+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
+        }
+
+        if (dst8 != NULL) {
+            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
+                                            bw4 * h_mul, bh4 * v_mul,
+                                            pos_x & 0x3ff, pos_y & 0x3ff,
+                                            f->svc[refidx][0].step,
+                                            f->svc[refidx][1].step
+                                            HIGHBD_CALL_SUFFIX);
+        } else {
+            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
+                                             bw4 * h_mul, bh4 * v_mul,
+                                             pos_x & 0x3ff, pos_y & 0x3ff,
+                                             f->svc[refidx][0].step,
+                                             f->svc[refidx][1].step
+                                             HIGHBD_CALL_SUFFIX);
+        }
+    }
+
+    return 0;
+}
+
+/* Overlapped block motion compensation for one plane of the current block:
+ * predictions built from the above and left neighbours' motion vectors are
+ * blended into the already-predicted pixels at dst.
+ *
+ * dst, dst_stride: prediction of the current block for plane pl.
+ * b_dim:           dav1d_block_dimensions[] entry of the current block.
+ * pl:              plane index; non-zero selects chroma subsampling.
+ * bx4, by4:        block position used to index the t->a / t->l filter arrays.
+ * w4, h4:          extent walked along the top / left edge.
+ *
+ * Above edge: skipped on the first row of the tile and, for chroma, when
+ * b_dim[0] * h_mul + b_dim[1] * v_mul < 16. At most imin(b_dim[2], 4)
+ * inter neighbours (ref[0] > 0) are used; each is predicted into the
+ * t->scratch.lap buffer via mc() and merged with blend_h().
+ * Left edge: same scheme, skipped on the first column of the tile, merged
+ * with blend_v().
+ *
+ * Returns 0 on success, or the non-zero result of a failing mc() call. */
+static int obmc(Dav1dTileContext *const t,
+                pixel *const dst, const ptrdiff_t dst_stride,
+                const uint8_t *const b_dim, const int pl,
+                const int bx4, const int by4, const int w4, const int h4)
+{
+    assert(!(t->bx & 1) && !(t->by & 1));
+    const Dav1dFrameContext *const f = t->f;
+    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+    pixel *const lap = bitfn(t->scratch.lap);
+    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    int res;
+
+    if (t->by > t->ts->tiling.row_start &&
+        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+    {
+        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+
+            if (a_r->ref.ref[0] > 0) {
+                const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
+                const int oh4 = imin(b_dim[1], 16) >> 1;
+                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
+                         t->bx + x, t->by, pl, a_r->mv.mv[0],
+                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
+                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+                                   h_mul * ow4, v_mul * oh4);
+                i++; /* counts only neighbours that were actually blended */
+            }
+            x += imax(a_b_dim[0], 2);
+        }
+    }
+
+    if (t->bx > t->ts->tiling.col_start)
+        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+
+            if (l_r->ref.ref[0] > 0) {
+                const int ow4 = imin(b_dim[0], 16) >> 1;
+                const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
+                         t->bx, t->by + y, pl, l_r->mv.mv[0],
+                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
+                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
+                i++; /* counts only neighbours that were actually blended */
+            }
+            y += imax(l_b_dim[1], 2);
+        }
+    return 0;
+}
+
+static int warp_affine(Dav1dTileContext *const t,
+                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
+                       const uint8_t *const b_dim, const int pl,
+                       const Dav1dThreadPicture *const refp,
+                       const Dav1dWarpedMotionParams *const wmp)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const
Dav1dFrameContext *const f = t->f; + const Dav1dDSPContext *const dsp = f->dsp; + const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); + const int32_t *const mat = wmp->matrix; + const int width = (refp->p.p.w + ss_hor) >> ss_hor; + const int height = (refp->p.p.h + ss_ver) >> ss_ver; + + for (int y = 0; y < b_dim[1] * v_mul; y += 8) { + const int src_y = t->by * 4 + ((y + 4) << ss_ver); + const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; + const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; + for (int x = 0; x < b_dim[0] * h_mul; x += 8) { + // calculate transformation relative to center of 8x8 block in + // luma pixel units + const int src_x = t->bx * 4 + ((x + 4) << ss_hor); + const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; + const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; + + const int dx = (int) (mvx >> 16) - 4; + const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 - + wmp->u.p.beta * 7) & ~0x3f; + const int dy = (int) (mvy >> 16) - 4; + const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 - + wmp->u.p.delta * 4) & ~0x3f; + + const pixel *ref_ptr; + ptrdiff_t ref_stride = refp->p.stride[!!pl]; + + if (dav1d_thread_picture_wait(refp, dy + 4 + 8, + PLANE_TYPE_Y + !!pl)) + { + return -1; + } + if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) { + pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); + f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3, + emu_edge_buf, 32 * sizeof(pixel), + refp->p.data[pl], ref_stride); + ref_ptr = &emu_edge_buf[32 * 3 + 3]; + ref_stride = 32 * sizeof(pixel); + } else { + ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; + } + if (dst16 != NULL) + dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride, + 
wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); + else + dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride, + wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); + } + if (dst8) dst8 += 8 * PXSTRIDE(dstride); + else dst16 += 8 * dstride; + } + return 0; +} + +void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs, + const enum EdgeFlags intra_edge_flags, + const Av1Block *const b) +{ + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + const Dav1dDSPContext *const dsp = f->dsp; + const int bx4 = t->bx & 31, by4 = t->by & 31; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = b_dim[0], bh4 = b_dim[1]; + const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); + const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && + (bw4 > ss_hor || t->bx & 1) && + (bh4 > ss_ver || t->by & 1); + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; + const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; + + // coefficient coding + pixel *const edge = bitfn(t->scratch.edge) + 128; + const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; + + const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; + + for (int init_y = 0; init_y < h4; init_y += 16) { + const int sub_h4 = imin(h4, 16 + init_y); + const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); + for (int init_x = 0; init_x < w4; init_x += 16) { + if (b->pal_sz[0]) { + pixel *dst = ((pixel *) f->cur.data[0]) + + 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); + const uint8_t *pal_idx; + if (f->frame_thread.pass) { + assert(ts->frame_thread.pal_idx); + pal_idx = ts->frame_thread.pal_idx; + ts->frame_thread.pal_idx += bw4 
* bh4 * 16; + } else { + pal_idx = t->scratch.pal_idx; + } + const uint16_t *const pal = f->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; + f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, + pal_idx, bw4 * 4, bh4 * 4); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + hex_dump(dst, PXSTRIDE(f->cur.stride[0]), + bw4 * 4, bh4 * 4, "y-pal-pred"); + } + + const int intra_flags = (sm_flag(t->a, bx4) | + sm_flag(&t->l, by4) | + intra_edge_filter_flag); + const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 : + intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT; + const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 : + intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM; + int y, x; + const int sub_w4 = imin(w4, init_x + 16); + for (y = init_y, t->by += init_y; y < sub_h4; + y += t_dim->h, t->by += t_dim->h) + { + pixel *dst = ((pixel *) f->cur.data[0]) + + 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + + t->bx + init_x); + for (x = init_x, t->bx += init_x; x < sub_w4; + x += t_dim->w, t->bx += t_dim->w) + { + if (b->pal_sz[0]) goto skip_y_pred; + + int angle = b->y_angle; + const enum EdgeFlags edge_flags = + (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ? + 0 : EDGE_I444_TOP_HAS_RIGHT) | + ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ? 
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM); + const pixel *top_sb_edge = NULL; + if (!(t->by & (f->sb_step - 1))) { + top_sb_edge = f->ipred_edge[0]; + const int sby = t->by >> f->sb_shift; + top_sb_edge += f->sb128w * 128 * (sby - 1); + } + const enum IntraPredMode m = + bytefn(dav1d_prepare_intra_edges)(t->bx, + t->bx > ts->tiling.col_start, + t->by, + t->by > ts->tiling.row_start, + ts->tiling.col_end, + ts->tiling.row_end, + edge_flags, dst, + f->cur.stride[0], top_sb_edge, + b->y_mode, &angle, + t_dim->w, t_dim->h, + f->seq_hdr->intra_edge_filter, + edge HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge, + t_dim->w * 4, t_dim->h * 4, + angle | intra_flags, + 4 * f->bw - 4 * t->bx, + 4 * f->bh - 4 * t->by + HIGHBD_CALL_SUFFIX); + + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { + hex_dump(edge - t_dim->h * 4, t_dim->h * 4, + t_dim->h * 4, 2, "l"); + hex_dump(edge, 0, 1, 1, "tl"); + hex_dump(edge + 1, t_dim->w * 4, + t_dim->w * 4, 2, "t"); + hex_dump(dst, f->cur.stride[0], + t_dim->w * 4, t_dim->h * 4, "y-intra-pred"); + } + + skip_y_pred: {} + if (!b->skip) { + coef *cf; + int eob; + enum TxfmType txtp; + if (f->frame_thread.pass) { + cf = ts->frame_thread.cf; + ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; + const struct CodedBlockInfo *const cbi = + &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; + eob = cbi->eob[0]; + txtp = cbi->txtp[0]; + } else { + uint8_t cf_ctx; + cf = bitfn(t->cf); + eob = decode_coefs(t, &t->a->lcoef[bx4 + x], + &t->l.lcoef[by4 + y], b->tx, bs, + b, 1, 0, cf, &txtp, &cf_ctx); + if (DEBUG_BLOCK_INFO) + printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", + b->tx, txtp, eob, ts->msac.rng); +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir lcoef, off, mul * cf_ctx) +#define default_memset(dir, diridx, off, sz) \ + memset(&t->dir lcoef[off], cf_ctx, sz) + case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \ + l., 1, by4 + y); + 
case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \ + a->, 0, bx4 + x); +#undef default_memset +#undef set_ctx + } + if (eob >= 0) { + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + coef_dump(cf, imin(t_dim->h, 8) * 4, + imin(t_dim->w, 8) * 4, 3, "dq"); + dsp->itx.itxfm_add[b->tx] + [txtp](dst, + f->cur.stride[0], + cf, eob HIGHBD_CALL_SUFFIX); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + hex_dump(dst, f->cur.stride[0], + t_dim->w * 4, t_dim->h * 4, "recon"); + } + } else if (!f->frame_thread.pass) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir lcoef, off, mul * 0x40) + case_set_upto16(t_dim->h, l., 1, by4 + y); + case_set_upto16(t_dim->w, a->, 0, bx4 + x); +#undef set_ctx + } + dst += 4 * t_dim->w; + } + t->bx -= x; + } + t->by -= y; + + if (!has_chroma) continue; + + const ptrdiff_t stride = f->cur.stride[1]; + + if (b->uv_mode == CFL_PRED) { + assert(!init_x && !init_y); + + int16_t *const ac = t->scratch.ac; + pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) + + 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]); + const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) + + (t->by >> ss_ver) * PXSTRIDE(stride)); + pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off, + ((pixel *) f->cur.data[2]) + uv_off }; + + const int furthest_r = + ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1); + const int furthest_b = + ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1); + dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0], + cbw4 - (furthest_r >> ss_hor), + cbh4 - (furthest_b >> ss_ver), + cbw4 * 4, cbh4 * 4); + for (int pl = 0; pl < 2; pl++) { + if (!b->cfl_alpha[pl]) continue; + int angle = 0; + const pixel *top_sb_edge = NULL; + if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { + top_sb_edge = f->ipred_edge[pl + 1]; + const int sby = t->by >> f->sb_shift; + top_sb_edge += f->sb128w * 128 * (sby - 1); + } + const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; + const int xstart = 
ts->tiling.col_start >> ss_hor; + const int ystart = ts->tiling.row_start >> ss_ver; + const enum IntraPredMode m = + bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, + ypos, ypos > ystart, + ts->tiling.col_end >> ss_hor, + ts->tiling.row_end >> ss_ver, + 0, uv_dst[pl], stride, + top_sb_edge, DC_PRED, &angle, + uv_t_dim->w, uv_t_dim->h, 0, + edge HIGHBD_CALL_SUFFIX); + dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge, + uv_t_dim->w * 4, + uv_t_dim->h * 4, + ac, b->cfl_alpha[pl] + HIGHBD_CALL_SUFFIX); + } + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { + ac_dump(ac, 4*cbw4, 4*cbh4, "ac"); + hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred"); + hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred"); + } + } else if (b->pal_sz[1]) { + const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); + const uint16_t (*pal)[8]; + const uint8_t *pal_idx; + if (f->frame_thread.pass) { + assert(ts->frame_thread.pal_idx); + pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))]; + pal_idx = ts->frame_thread.pal_idx; + ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; + } else { + pal = t->scratch.pal; + pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; + } + + f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff, + f->cur.stride[1], pal[1], + pal_idx, cbw4 * 4, cbh4 * 4); + f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff, + f->cur.stride[1], pal[2], + pal_idx, cbw4 * 4, cbh4 * 4); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { + hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff, + PXSTRIDE(f->cur.stride[1]), + cbw4 * 4, cbh4 * 4, "u-pal-pred"); + hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff, + PXSTRIDE(f->cur.stride[1]), + cbw4 * 4, cbh4 * 4, "v-pal-pred"); + } + } + + const int sm_uv_fl = sm_uv_flag(t->a, cbx4) | + sm_uv_flag(&t->l, cby4); + const int uv_sb_has_tr = + ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 
0 : + intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1)); + const int uv_sb_has_bl = + init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 : + intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1)); + const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); + for (int pl = 0; pl < 2; pl++) { + for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; + y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) + { + pixel *dst = ((pixel *) f->cur.data[1 + pl]) + + 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) + + ((t->bx + init_x) >> ss_hor)); + for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; + x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) + { + if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) || + b->pal_sz[1]) + { + goto skip_uv_pred; + } + + int angle = b->uv_angle; + // this probably looks weird because we're using + // luma flags in a chroma loop, but that's because + // prepare_intra_edges() expects luma flags as input + const enum EdgeFlags edge_flags = + (((y > (init_y >> ss_ver) || !uv_sb_has_tr) && + (x + uv_t_dim->w >= sub_cw4)) ? + 0 : EDGE_I444_TOP_HAS_RIGHT) | + ((x > (init_x >> ss_hor) || + (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ? + 0 : EDGE_I444_LEFT_HAS_BOTTOM); + const pixel *top_sb_edge = NULL; + if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { + top_sb_edge = f->ipred_edge[1 + pl]; + const int sby = t->by >> f->sb_shift; + top_sb_edge += f->sb128w * 128 * (sby - 1); + } + const enum IntraPredMode uv_mode = + b->uv_mode == CFL_PRED ? 
DC_PRED : b->uv_mode; + const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; + const int xstart = ts->tiling.col_start >> ss_hor; + const int ystart = ts->tiling.row_start >> ss_ver; + const enum IntraPredMode m = + bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, + ypos, ypos > ystart, + ts->tiling.col_end >> ss_hor, + ts->tiling.row_end >> ss_ver, + edge_flags, dst, stride, + top_sb_edge, uv_mode, + &angle, uv_t_dim->w, + uv_t_dim->h, + f->seq_hdr->intra_edge_filter, + edge HIGHBD_CALL_SUFFIX); + angle |= intra_edge_filter_flag; + dsp->ipred.intra_pred[m](dst, stride, edge, + uv_t_dim->w * 4, + uv_t_dim->h * 4, + angle | sm_uv_fl, + (4 * f->bw + ss_hor - + 4 * (t->bx & ~ss_hor)) >> ss_hor, + (4 * f->bh + ss_ver - + 4 * (t->by & ~ss_ver)) >> ss_ver + HIGHBD_CALL_SUFFIX); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { + hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4, + uv_t_dim->h * 4, 2, "l"); + hex_dump(edge, 0, 1, 1, "tl"); + hex_dump(edge + 1, uv_t_dim->w * 4, + uv_t_dim->w * 4, 2, "t"); + hex_dump(dst, stride, uv_t_dim->w * 4, + uv_t_dim->h * 4, pl ? 
"v-intra-pred" : "u-intra-pred"); + } + + skip_uv_pred: {} + if (!b->skip) { + enum TxfmType txtp; + int eob; + coef *cf; + if (f->frame_thread.pass) { + cf = ts->frame_thread.cf; + ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16; + const struct CodedBlockInfo *const cbi = + &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; + eob = cbi->eob[pl + 1]; + txtp = cbi->txtp[pl + 1]; + } else { + uint8_t cf_ctx; + cf = bitfn(t->cf); + eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], + &t->l.ccoef[pl][cby4 + y], + b->uvtx, bs, b, 1, 1 + pl, cf, + &txtp, &cf_ctx); + if (DEBUG_BLOCK_INFO) + printf("Post-uv-cf-blk[pl=%d,tx=%d," + "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n", + pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4); +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) +#define default_memset(dir, diridx, off, sz) \ + memset(&t->dir ccoef[pl][off], cf_ctx, sz) + case_set_upto16_with_default( \ + imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver), + l., 1, cby4 + y); + case_set_upto16_with_default( \ + imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor), + a->, 0, cbx4 + x); +#undef default_memset +#undef set_ctx + } + if (eob >= 0) { + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + coef_dump(cf, uv_t_dim->h * 4, + uv_t_dim->w * 4, 3, "dq"); + dsp->itx.itxfm_add[b->uvtx] + [txtp](dst, stride, + cf, eob HIGHBD_CALL_SUFFIX); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + hex_dump(dst, stride, uv_t_dim->w * 4, + uv_t_dim->h * 4, "recon"); + } + } else if (!f->frame_thread.pass) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[pl], off, mul * 0x40) + case_set_upto16(uv_t_dim->h, l., 1, cby4 + y); + case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x); +#undef set_ctx + } + dst += uv_t_dim->w * 4; + } + t->bx -= x << ss_hor; + } + t->by -= y << ss_ver; + } + } + } +} + +int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs, + const Av1Block *const 
b) +{ + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + const Dav1dDSPContext *const dsp = f->dsp; + const int bx4 = t->bx & 31, by4 = t->by & 31; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = b_dim[0], bh4 = b_dim[1]; + const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && + (bw4 > ss_hor || t->bx & 1) && + (bh4 > ss_ver || t->by & 1); + const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 : + DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout; + int res; + + // prediction + const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor; + pixel *dst = ((pixel *) f->cur.data[0]) + + 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); + const ptrdiff_t uvdstoff = + 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); + if (!(f->frame_hdr->frame_type & 1)) { + // intrabc + assert(!f->frame_hdr->super_res.enabled); + res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, + b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); + if (res) return res; + if (has_chroma) for (int pl = 1; pl < 3; pl++) { + res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1], + bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), + t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0], + &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); + if (res) return res; + } + } else if (b->comp_type == COMP_INTER_NONE) { + const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]]; + const enum Filter2d filter_2d = b->filter2d; + + if (imin(bw4, bh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + 
{ + res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp, + b->motion_mode == MM_WARP ? &t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + if (res) return res; + } else { + res = mc(t, dst, NULL, f->cur.stride[0], + bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d); + if (res) return res; + if (b->motion_mode == MM_OBMC) { + res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4); + if (res) return res; + } + } + if (b->interintra_type) { + pixel *const tl_edge = bitfn(t->scratch.edge) + 32; + enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? + SMOOTH_PRED : b->interintra_mode; + pixel *const tmp = bitfn(t->scratch.interintra); + int angle = 0; + const pixel *top_sb_edge = NULL; + if (!(t->by & (f->sb_step - 1))) { + top_sb_edge = f->ipred_edge[0]; + const int sby = t->by >> f->sb_shift; + top_sb_edge += f->sb128w * 128 * (sby - 1); + } + m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start, + t->by, t->by > ts->tiling.row_start, + ts->tiling.col_end, ts->tiling.row_end, + 0, dst, f->cur.stride[0], top_sb_edge, + m, &angle, bw4, bh4, 0, tl_edge + HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), + tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 + HIGHBD_CALL_SUFFIX); + const uint8_t *const ii_mask = + b->interintra_type == INTER_INTRA_BLEND ? 
+ dav1d_ii_masks[bs][0][b->interintra_mode] : + dav1d_wedge_masks[bs][0][0][b->wedge_idx]; + dsp->mc.blend(dst, f->cur.stride[0], tmp, + bw4 * 4, bh4 * 4, ii_mask); + } + + if (!has_chroma) goto skip_inter_chroma_pred; + + // sub8x8 derivation + int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; + refmvs_block *const *r; + if (is_sub8x8) { + assert(ss_hor == 1); + r = &t->rt.r[(t->by & 31) + 5]; + if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; + if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; + if (bw4 == 1 && bh4 == ss_ver) + is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; + } + + // chroma prediction + if (is_sub8x8) { + assert(ss_hor == 1); + ptrdiff_t h_off = 0, v_off = 0; + if (bw4 == 1 && bh4 == ss_ver) { + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, + NULL, f->cur.stride[1], + bw4, bh4, t->bx - 1, t->by - 1, 1 + pl, + r[-1][t->bx - 1].mv.mv[0], + &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], + r[-1][t->bx - 1].ref.ref[0] - 1, + f->frame_thread.pass != 2 ? t->tl_4x4_filter : + f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); + if (res) return res; + } + v_off = 2 * PXSTRIDE(f->cur.stride[1]); + h_off = 2; + } + if (bw4 == 1) { + const enum Filter2d left_filter_2d = + dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]]; + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL, + f->cur.stride[1], bw4, bh4, t->bx - 1, + t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], + &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], + r[0][t->bx - 1].ref.ref[0] - 1, + f->frame_thread.pass != 2 ? 
left_filter_2d : + f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); + if (res) return res; + } + h_off = 2; + } + if (bh4 == ss_ver) { + const enum Filter2d top_filter_2d = + dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]]; + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL, + f->cur.stride[1], bw4, bh4, t->bx, t->by - 1, + 1 + pl, r[-1][t->bx].mv.mv[0], + &f->refp[r[-1][t->bx].ref.ref[0] - 1], + r[-1][t->bx].ref.ref[0] - 1, + f->frame_thread.pass != 2 ? top_filter_2d : + f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); + if (res) return res; + } + v_off = 2 * PXSTRIDE(f->cur.stride[1]); + } + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1], + bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], + refp, b->ref[0], filter_2d); + if (res) return res; + } + } else { + if (imin(cbw4, cbh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + { + for (int pl = 0; pl < 2; pl++) { + res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL, + f->cur.stride[1], b_dim, 1 + pl, refp, + b->motion_mode == MM_WARP ? 
&t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + if (res) return res; + } + } else { + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, + NULL, f->cur.stride[1], + bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), + t->bx & ~ss_hor, t->by & ~ss_ver, + 1 + pl, b->mv[0], refp, b->ref[0], filter_2d); + if (res) return res; + if (b->motion_mode == MM_OBMC) { + res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, + f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4); + if (res) return res; + } + } + } + if (b->interintra_type) { + // FIXME for 8x32 with 4:2:2 subsampling, this probably does + // the wrong thing since it will select 4x16, not 4x32, as a + // transform size... + const uint8_t *const ii_mask = + b->interintra_type == INTER_INTRA_BLEND ? + dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] : + dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx]; + + for (int pl = 0; pl < 2; pl++) { + pixel *const tmp = bitfn(t->scratch.interintra); + pixel *const tl_edge = bitfn(t->scratch.edge) + 32; + enum IntraPredMode m = + b->interintra_mode == II_SMOOTH_PRED ? 
+ SMOOTH_PRED : b->interintra_mode; + int angle = 0; + pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; + const pixel *top_sb_edge = NULL; + if (!(t->by & (f->sb_step - 1))) { + top_sb_edge = f->ipred_edge[pl + 1]; + const int sby = t->by >> f->sb_shift; + top_sb_edge += f->sb128w * 128 * (sby - 1); + } + m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor, + (t->bx >> ss_hor) > + (ts->tiling.col_start >> ss_hor), + t->by >> ss_ver, + (t->by >> ss_ver) > + (ts->tiling.row_start >> ss_ver), + ts->tiling.col_end >> ss_hor, + ts->tiling.row_end >> ss_ver, + 0, uvdst, f->cur.stride[1], + top_sb_edge, m, + &angle, cbw4, cbh4, 0, tl_edge + HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), + tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 + HIGHBD_CALL_SUFFIX); + dsp->mc.blend(uvdst, f->cur.stride[1], tmp, + cbw4 * 4, cbh4 * 4, ii_mask); + } + } + } + + skip_inter_chroma_pred: {} + t->tl_4x4_filter = filter_2d; + } else { + const enum Filter2d filter_2d = b->filter2d; + // Maximum super block size is 128x128 + int16_t (*tmp)[128 * 128] = t->scratch.compinter; + int jnt_weight; + uint8_t *const seg_mask = t->scratch.seg_mask; + const uint8_t *mask; + + for (int i = 0; i < 2; i++) { + const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; + + if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { + res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, + &f->frame_hdr->gmv[b->ref[i]]); + if (res) return res; + } else { + res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0, + b->mv[i], refp, b->ref[i], filter_2d); + if (res) return res; + } + } + switch (b->comp_type) { + case COMP_INTER_AVG: + dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1], + bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); + break; + case COMP_INTER_WEIGHTED_AVG: + jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]]; + dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1], + bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX); + break; + case 
COMP_INTER_SEG: + dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0], + tmp[b->mask_sign], tmp[!b->mask_sign], + bw4 * 4, bh4 * 4, seg_mask, + b->mask_sign HIGHBD_CALL_SUFFIX); + mask = seg_mask; + break; + case COMP_INTER_WEDGE: + mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx]; + dsp->mc.mask(dst, f->cur.stride[0], + tmp[b->mask_sign], tmp[!b->mask_sign], + bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); + if (has_chroma) + mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx]; + break; + } + + // chroma + if (has_chroma) for (int pl = 0; pl < 2; pl++) { + for (int i = 0; i < 2; i++) { + const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; + if (b->inter_mode == GLOBALMV_GLOBALMV && + imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) + { + res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, + b_dim, 1 + pl, + refp, &f->frame_hdr->gmv[b->ref[i]]); + if (res) return res; + } else { + res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, + 1 + pl, b->mv[i], refp, b->ref[i], filter_2d); + if (res) return res; + } + } + pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; + switch (b->comp_type) { + case COMP_INTER_AVG: + dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], + bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver + HIGHBD_CALL_SUFFIX); + break; + case COMP_INTER_WEIGHTED_AVG: + dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], + bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight + HIGHBD_CALL_SUFFIX); + break; + case COMP_INTER_WEDGE: + case COMP_INTER_SEG: + dsp->mc.mask(uvdst, f->cur.stride[1], + tmp[b->mask_sign], tmp[!b->mask_sign], + bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask + HIGHBD_CALL_SUFFIX); + break; + } + } + } + + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { + hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred"); + if (has_chroma) { + hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1], + cbw4 * 4, cbh4 * 4, "u-pred"); + hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], 
f->cur.stride[1], + cbw4 * 4, cbh4 * 4, "v-pred"); + } + } + + const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; + + if (b->skip) { + // reset coef contexts +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir lcoef, off, mul * 0x40) + case_set(bh4, l., 1, by4); + case_set(bw4, a->, 0, bx4); +#undef set_ctx + if (has_chroma) { +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \ + rep_macro(type, t->dir ccoef[1], off, mul * 0x40) + case_set(cbh4, l., 1, cby4); + case_set(cbw4, a->, 0, cbx4); +#undef set_ctx + } + return 0; + } + + const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx]; + const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx]; + const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; + + for (int init_y = 0; init_y < bh4; init_y += 16) { + for (int init_x = 0; init_x < bw4; init_x += 16) { + // coefficient coding & inverse transforms + int y_off = !!init_y, y; + dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y; + for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16); + y += ytx->h, y_off++) + { + int x, x_off = !!init_x; + for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16); + x += ytx->w, x_off++) + { + read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, + x_off, y_off, &dst[x * 4]); + t->bx += ytx->w; + } + dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h; + t->bx -= x; + t->by += ytx->h; + } + dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y; + t->by -= y; + + // chroma coefs and inverse transform + if (has_chroma) for (int pl = 0; pl < 2; pl++) { + pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff + + (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver); + for (y = init_y >> ss_ver, t->by += init_y; + y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h) + { + int x; + for (x = init_x >> ss_hor, t->bx += init_x; + x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w) + { + coef *cf; + int eob; + enum 
TxfmType txtp; + if (f->frame_thread.pass) { + cf = ts->frame_thread.cf; + ts->frame_thread.cf += uvtx->w * uvtx->h * 16; + const struct CodedBlockInfo *const cbi = + &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; + eob = cbi->eob[1 + pl]; + txtp = cbi->txtp[1 + pl]; + } else { + uint8_t cf_ctx; + cf = bitfn(t->cf); + txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 + + bx4 + (x << ss_hor)]; + eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], + &t->l.ccoef[pl][cby4 + y], + b->uvtx, bs, b, 0, 1 + pl, + cf, &txtp, &cf_ctx); + if (DEBUG_BLOCK_INFO) + printf("Post-uv-cf-blk[pl=%d,tx=%d," + "txtp=%d,eob=%d]: r=%d\n", + pl, b->uvtx, txtp, eob, ts->msac.rng); +#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) +#define default_memset(dir, diridx, off, sz) \ + memset(&t->dir ccoef[pl][off], cf_ctx, sz) + case_set_upto16_with_default( \ + imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver), + l., 1, cby4 + y); + case_set_upto16_with_default( \ + imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor), + a->, 0, cbx4 + x); +#undef default_memset +#undef set_ctx + } + if (eob >= 0) { + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq"); + dsp->itx.itxfm_add[b->uvtx] + [txtp](&uvdst[4 * x], + f->cur.stride[1], + cf, eob HIGHBD_CALL_SUFFIX); + if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) + hex_dump(&uvdst[4 * x], f->cur.stride[1], + uvtx->w * 4, uvtx->h * 4, "recon"); + } + t->bx += uvtx->w << ss_hor; + } + uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h; + t->bx -= x << ss_hor; + t->by += uvtx->h << ss_ver; + } + t->by -= y << ss_ver; + } + } + } + return 0; +} + +void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { + const int sbsz = f->sb_step, sbh = f->sbh; + + if (f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1]) + { + int start_of_tile_row = 0; + if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby) + start_of_tile_row = 
f->lf.tile_row++; + bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby, + start_of_tile_row); + } + + if (f->lf.restore_planes) { + // Store loop filtered pixels required by loop restoration + bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby); + } + if (f->seq_hdr->cdef) { + if (sby) { + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *p_up[3] = { + f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + }; + bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr, + sby * sbsz - 2, sby * sbsz); + } + const int n_blks = sbsz - 2 * (sby + 1 < sbh); + bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz, + imin(sby * sbsz + n_blks, f->bh)); + } + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; + for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { + const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int h_start = 8 * !!sby >> ss_ver; + const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; + pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride); + const ptrdiff_t src_stride = f->cur.stride[!!pl]; + const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride); + const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver; + const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int src_w = (4 * f->bw + ss_hor) >> ss_hor; + const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; + + f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, + imin(img_h, h_end) + h_start, src_w, + f->resize_step[!!pl], f->resize_start[!!pl] + HIGHBD_CALL_SUFFIX); + } + } + if (f->lf.restore_planes) { + bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby); + } + + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + f->lf.p[0] += sbsz * 4 * 
PXSTRIDE(f->cur.stride[0]); + f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; + f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; + f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]); + f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; + f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; + f->lf.prev_mask_ptr = f->lf.mask_ptr; + if ((sby & 1) || f->seq_hdr->sb128) { + f->lf.mask_ptr += f->sb128w; + } +} + +void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) { + const Dav1dFrameContext *const f = t->f; + Dav1dTileState *const ts = t->ts; + const int sby = t->by >> f->sb_shift; + const int sby_off = f->sb128w * 128 * sby; + const int x_off = ts->tiling.col_start; + + const pixel *const y = + ((const pixel *) f->cur.data[0]) + x_off * 4 + + ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]); + pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y, + 4 * (ts->tiling.col_end - x_off)); + + if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + + const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) + + (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]); + for (int pl = 1; pl <= 2; pl++) + pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)], + &((const pixel *) f->cur.data[pl])[uv_off], + 4 * (ts->tiling.col_end - x_off) >> ss_hor); + } +} diff --git a/third_party/dav1d/src/ref.c b/third_party/dav1d/src/ref.c new file mode 100644 index 0000000000..3889cba565 --- /dev/null +++ b/third_party/dav1d/src/ref.c @@ -0,0 +1,111 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "src/ref.h" + +static void default_free_callback(const uint8_t *const data, void *const user_data) { + assert(data == user_data); + dav1d_free_aligned(user_data); +} + +Dav1dRef *dav1d_ref_create(size_t size) { + size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64); + if (!data) return NULL; + + Dav1dRef *const res = (Dav1dRef*)(data + size); + res->const_data = res->user_data = res->data = data; + atomic_init(&res->ref_cnt, 1); + res->free_ref = 0; + res->free_callback = default_free_callback; + + return res; +} + +static void pool_free_callback(const uint8_t *const data, void *const user_data) { + dav1d_mem_pool_push((Dav1dMemPool*)data, user_data); +} + +Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) { + size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + Dav1dMemPoolBuffer *const buf = + dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef)); + if (!buf) return NULL; + + Dav1dRef *const res = &((Dav1dRef*)buf)[-1]; + res->data = buf->data; + res->const_data = pool; + atomic_init(&res->ref_cnt, 1); + res->free_ref = 0; + res->free_callback = pool_free_callback; + res->user_data = buf; + + return res; +} + +Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr, + void (*free_callback)(const uint8_t *data, void *user_data), + void *const user_data) +{ + Dav1dRef *res = malloc(sizeof(Dav1dRef)); + if (!res) return NULL; + + res->data = NULL; + res->const_data = ptr; + atomic_init(&res->ref_cnt, 1); + res->free_ref = 1; + res->free_callback = free_callback; + res->user_data = user_data; + + return res; +} + +void dav1d_ref_inc(Dav1dRef *const ref) { + atomic_fetch_add(&ref->ref_cnt, 1); +} + +void dav1d_ref_dec(Dav1dRef **const pref) { + assert(pref != NULL); + + Dav1dRef *const ref = *pref; + if (!ref) return; + + if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { + const int free_ref = ref->free_ref; + 
ref->free_callback(ref->const_data, ref->user_data); + if (free_ref) free(ref); + } + *pref = NULL; +} + +int dav1d_ref_is_writable(Dav1dRef *const ref) { + return atomic_load(&ref->ref_cnt) == 1 && ref->data; +} diff --git a/third_party/dav1d/src/ref.h b/third_party/dav1d/src/ref.h new file mode 100644 index 0000000000..54f5f69f88 --- /dev/null +++ b/third_party/dav1d/src/ref.h @@ -0,0 +1,58 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_REF_H +#define DAV1D_SRC_REF_H + +#include "dav1d/dav1d.h" + +#include "src/mem.h" +#include "src/thread.h" + +#include +#include + +struct Dav1dRef { + void *data; + const void *const_data; + atomic_int ref_cnt; + int free_ref; + void (*free_callback)(const uint8_t *data, void *user_data); + void *user_data; +}; + +Dav1dRef *dav1d_ref_create(size_t size); +Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size); +Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr, + void (*free_callback)(const uint8_t *data, void *user_data), + void *user_data); +void dav1d_ref_inc(Dav1dRef *ref); +void dav1d_ref_dec(Dav1dRef **ref); + +int dav1d_ref_is_writable(Dav1dRef *ref); + +#endif /* DAV1D_SRC_REF_H */ diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c new file mode 100644 index 0000000000..1e113b4eac --- /dev/null +++ b/third_party/dav1d/src/refmvs.c @@ -0,0 +1,909 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2020, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include + +#include "dav1d/common.h" + +#include "common/intops.h" + +#include "src/env.h" +#include "src/refmvs.h" + +static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt, + const int weight, const refmvs_block *const b, + const union refmvs_refpair ref, const mv gmv[2], + int *const have_newmv_match, + int *const have_refmv_match) +{ + if (b->mv.mv[0].n == INVALID_MV) return; // intra block, no intrabc + + if (ref.ref[1] == -1) { + for (int n = 0; n < 2; n++) { + if (b->ref.ref[n] == ref.ref[0]) { + const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? + gmv[0] : b->mv.mv[n]; + + const int last = *cnt; + for (int m = 0; m < last; m++) + if (mvstack[m].mv.mv[0].n == cand_mv.n) { + mvstack[m].weight += weight; + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + return; + } + + if (last < 8) { + mvstack[last].mv.mv[0] = cand_mv; + mvstack[last].weight = weight; + *cnt = last + 1; + } + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + return; + } + } + } else if (b->ref.pair == ref.pair) { + const refmvs_mvpair cand_mv = { .mv = { + [0] = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[0], + [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? 
gmv[1] : b->mv.mv[1], + }}; + + const int last = *cnt; + for (int n = 0; n < last; n++) + if (mvstack[n].mv.n == cand_mv.n) { + mvstack[n].weight += weight; + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + return; + } + + if (last < 8) { + mvstack[last].mv = cand_mv; + mvstack[last].weight = weight; + *cnt = last + 1; + } + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + } +} + +static int scan_row(refmvs_candidate *const mvstack, int *const cnt, + const union refmvs_refpair ref, const mv gmv[2], + const refmvs_block *b, const int bw4, const int w4, + const int max_rows, const int step, + int *const have_newmv_match, int *const have_refmv_match) +{ + const refmvs_block *cand_b = b; + const enum BlockSize first_cand_bs = cand_b->bs; + const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs]; + int cand_bw4 = first_cand_b_dim[0]; + int len = imax(step, imin(bw4, cand_bw4)); + + if (bw4 <= cand_bw4) { + // FIXME weight can be higher for odd blocks (bx4 & 1), but then the + // position of the first block has to be odd already, i.e. not just + // for row_offset=-3/-5 + // FIXME why can this not be cand_bw4? + const int weight = bw4 == 1 ? 
2 : + imax(2, imin(2 * max_rows, first_cand_b_dim[1])); + add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv, + have_newmv_match, have_refmv_match); + return weight >> 1; + } + + for (int x = 0;;) { + // FIXME if we overhang above, we could fill a bitmask so we don't have + // to repeat the add_spatial_candidate() for the next row, but just increase + // the weight here + add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv, + have_newmv_match, have_refmv_match); + x += len; + if (x >= w4) return 1; + cand_b = &b[x]; + cand_bw4 = dav1d_block_dimensions[cand_b->bs][0]; + assert(cand_bw4 < bw4); + len = imax(step, cand_bw4); + } +} + +static int scan_col(refmvs_candidate *const mvstack, int *const cnt, + const union refmvs_refpair ref, const mv gmv[2], + /*const*/ refmvs_block *const *b, const int bh4, const int h4, + const int bx4, const int max_cols, const int step, + int *const have_newmv_match, int *const have_refmv_match) +{ + const refmvs_block *cand_b = &b[0][bx4]; + const enum BlockSize first_cand_bs = cand_b->bs; + const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs]; + int cand_bh4 = first_cand_b_dim[1]; + int len = imax(step, imin(bh4, cand_bh4)); + + if (bh4 <= cand_bh4) { + // FIXME weight can be higher for odd blocks (by4 & 1), but then the + // position of the first block has to be odd already, i.e. not just + // for col_offset=-3/-5 + // FIXME why can this not be cand_bh4? + const int weight = bh4 == 1 ? 
2 : + imax(2, imin(2 * max_cols, first_cand_b_dim[0])); + add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv, + have_newmv_match, have_refmv_match); + return weight >> 1; + } + + for (int y = 0;;) { + // FIXME if we overhang above, we could fill a bitmask so we don't have + // to repeat the add_spatial_candidate() for the next row, but just increase + // the weight here + add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv, + have_newmv_match, have_refmv_match); + y += len; + if (y >= h4) return 1; + cand_b = &b[y][bx4]; + cand_bh4 = dav1d_block_dimensions[cand_b->bs][1]; + assert(cand_bh4 < bh4); + len = imax(step, cand_bh4); + } +} + +static inline union mv mv_projection(const union mv mv, const int num, const int den) { + static const uint16_t div_mult[32] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 + }; + assert(den > 0 && den < 32); + assert(num > -32 && num < 32); + const int frac = num * div_mult[den]; + const int y = mv.y * frac, x = mv.x * frac; + // Round and clip according to AV1 spec section 7.9.3 + return (union mv) { // 0x3fff == (1 << 14) - 1 + .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff), + .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff) + }; +} + +static void add_temporal_candidate(const refmvs_frame *const rf, + refmvs_candidate *const mvstack, int *const cnt, + const refmvs_temporal_block *const rb, + const union refmvs_refpair ref, int *const globalmv_ctx, + const union mv gmv[]) +{ + if (rb->mv.n == INVALID_MV) return; + + union mv mv = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref); + fix_mv_precision(rf->frm_hdr, &mv); + + const int last = *cnt; + if (ref.ref[1] == -1) { + if (globalmv_ctx) + *globalmv_ctx = (abs(mv.x - gmv[0].x) | abs(mv.y - gmv[0].y)) >= 16; + + for (int n = 0; n < last; n++) + if (mvstack[n].mv.mv[0].n == mv.n) { + 
mvstack[n].weight += 2; + return; + } + if (last < 8) { + mvstack[last].mv.mv[0] = mv; + mvstack[last].weight = 2; + *cnt = last + 1; + } + } else { + refmvs_mvpair mvp = { .mv = { + [0] = mv, + [1] = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref), + }}; + fix_mv_precision(rf->frm_hdr, &mvp.mv[1]); + + for (int n = 0; n < last; n++) + if (mvstack[n].mv.n == mvp.n) { + mvstack[n].weight += 2; + return; + } + if (last < 8) { + mvstack[last].mv = mvp; + mvstack[last].weight = 2; + *cnt = last + 1; + } + } +} + +static void add_compound_extended_candidate(refmvs_candidate *const same, + int *const same_count, + const refmvs_block *const cand_b, + const int sign0, const int sign1, + const union refmvs_refpair ref, + const uint8_t *const sign_bias) +{ + refmvs_candidate *const diff = &same[2]; + int *const diff_count = &same_count[2]; + + for (int n = 0; n < 2; n++) { + const int cand_ref = cand_b->ref.ref[n]; + + if (cand_ref <= 0) break; + + mv cand_mv = cand_b->mv.mv[n]; + if (cand_ref == ref.ref[0]) { + if (same_count[0] < 2) + same[same_count[0]++].mv.mv[0] = cand_mv; + if (diff_count[1] < 2) { + if (sign1 ^ sign_bias[cand_ref - 1]) { + cand_mv.y = -cand_mv.y; + cand_mv.x = -cand_mv.x; + } + diff[diff_count[1]++].mv.mv[1] = cand_mv; + } + } else if (cand_ref == ref.ref[1]) { + if (same_count[1] < 2) + same[same_count[1]++].mv.mv[1] = cand_mv; + if (diff_count[0] < 2) { + if (sign0 ^ sign_bias[cand_ref - 1]) { + cand_mv.y = -cand_mv.y; + cand_mv.x = -cand_mv.x; + } + diff[diff_count[0]++].mv.mv[0] = cand_mv; + } + } else { + mv i_cand_mv = (union mv) { + .x = -cand_mv.x, + .y = -cand_mv.y + }; + + if (diff_count[0] < 2) { + diff[diff_count[0]++].mv.mv[0] = + sign0 ^ sign_bias[cand_ref - 1] ? + i_cand_mv : cand_mv; + } + + if (diff_count[1] < 2) { + diff[diff_count[1]++].mv.mv[1] = + sign1 ^ sign_bias[cand_ref - 1] ? 
+ i_cand_mv : cand_mv; + } + } + } +} + +static void add_single_extended_candidate(refmvs_candidate mvstack[8], int *const cnt, + const refmvs_block *const cand_b, + const int sign, const uint8_t *const sign_bias) +{ + for (int n = 0; n < 2; n++) { + const int cand_ref = cand_b->ref.ref[n]; + + if (cand_ref <= 0) break; + // we need to continue even if cand_ref == ref.ref[0], since + // the candidate could have been added as a globalmv variant, + // which changes the value + // FIXME if scan_{row,col}() returned a mask for the nearest + // edge, we could skip the appropriate ones here + + mv cand_mv = cand_b->mv.mv[n]; + if (sign ^ sign_bias[cand_ref - 1]) { + cand_mv.y = -cand_mv.y; + cand_mv.x = -cand_mv.x; + } + + int m; + const int last = *cnt; + for (m = 0; m < last; m++) + if (cand_mv.n == mvstack[m].mv.mv[0].n) + break; + if (m == last) { + mvstack[m].mv.mv[0] = cand_mv; + mvstack[m].weight = 2; // "minimal" + *cnt = last + 1; + } + } +} + +/* + * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame + * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing. + * mvrefs_tile[] keeps a list of 35 (32 + 3 above) pointers into this memory, + * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top + * (-5/-3/-1) pointers by calling dav1d_refmvs_tile_sbrow_init() at the start + * of each tile/sbrow. + * + * For temporal MV referencing, we call dav1d_refmvs_save_tmvs() at the end of + * each tile/sbrow (when tile column threading is enabled), or at the start of + * each interleaved sbrow (i.e. once for all tile columns together, when tile + * column threading is disabled). This will copy the 4x4-resolution spatial MVs + * into 8x8-resolution refmvs_temporal_block structures. 
Then, for subsequent + * frames, at the start of each tile/sbrow (when tile column threading is + * enabled) or at the start of each interleaved sbrow (when tile column + * threading is disabled), we call load_tmvs(), which will project the MVs to + * their respective position in the current frame. + */ + +void dav1d_refmvs_find(const refmvs_tile *const rt, + refmvs_candidate mvstack[8], int *const cnt, + int *const ctx, + const union refmvs_refpair ref, const enum BlockSize bs, + const enum EdgeFlags edge_flags, + const int by4, const int bx4) +{ + const refmvs_frame *const rf = rt->rf; + const uint8_t *const b_dim = dav1d_block_dimensions[bs]; + const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4); + const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4); + mv gmv[2], tgmv[2]; + + *cnt = 0; + assert(ref.ref[0] >= 0 && ref.ref[0] <= 8 && + ref.ref[1] >= -1 && ref.ref[1] <= 8); + if (ref.ref[0] > 0) { + tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1], + bx4, by4, bw4, bh4, rf->frm_hdr); + gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ? + tgmv[0] : (mv) { .n = INVALID_MV }; + } else { + tgmv[0] = (mv) { .n = 0 }; + gmv[0] = (mv) { .n = INVALID_MV }; + } + if (ref.ref[1] > 0) { + tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1], + bx4, by4, bw4, bh4, rf->frm_hdr); + gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ? + tgmv[1] : (mv) { .n = INVALID_MV }; + } + + // top + int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0; + unsigned max_rows = 0, n_rows = ~0; + const refmvs_block *b_top; + if (by4 > rt->tile_row.start) { + max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1)); + b_top = &rt->r[(by4 & 31) - 1 + 5][bx4]; + n_rows = scan_row(mvstack, cnt, ref, gmv, b_top, + bw4, w4, max_rows, bw4 >= 16 ? 
4 : 1, + &have_newmv, &have_row_mvs); + } + + // left + unsigned max_cols = 0, n_cols = ~0U; + refmvs_block *const *b_left; + if (bx4 > rt->tile_col.start) { + max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1)); + b_left = &rt->r[(by4 & 31) + 5]; + n_cols = scan_col(mvstack, cnt, ref, gmv, b_left, + bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1, + &have_newmv, &have_col_mvs); + } + + // top/right + if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT && + imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end) + { + add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv, + &have_newmv, &have_row_mvs); + } + + const int nearest_match = have_col_mvs + have_row_mvs; + const int nearest_cnt = *cnt; + for (int n = 0; n < nearest_cnt; n++) + mvstack[n].weight += 640; + + // temporal + int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs; + if (rf->use_ref_frame_mvs) { + const ptrdiff_t stride = rf->rp_stride; + const int by8 = by4 >> 1, bx8 = bx4 >> 1; + const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8]; + const refmvs_temporal_block *rb = rbi; + const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1; + const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8); + for (int y = 0; y < h8; y += step_v) { + for (int x = 0; x < w8; x+= step_h) { + add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref, + !(x | y) ? 
&globalmv_ctx : NULL, tgmv); + } + rb += stride * step_v; + } + if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) { + const int bh8 = bh4 >> 1, bw8 = bw4 >> 1; + rb = &rbi[bh8 * stride]; + const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1, + (by8 & ~7) + 8); + if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) { + add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref, + NULL, NULL); + } + if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) { + if (has_bottom) { + add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref, + NULL, NULL); + } + if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) { + add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride], + ref, NULL, NULL); + } + } + } + } + assert(*cnt <= 8); + + // top/left (which, confusingly, is part of "secondary" references) + int have_dummy_newmv_match; + if ((n_rows | n_cols) != ~0U) { + add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv, + &have_dummy_newmv_match, &have_row_mvs); + } + + // "secondary" (non-direct neighbour) top & left edges + // what is different about secondary is that everything is now in 8x8 resolution + for (int n = 2; n <= 3; n++) { + if ((unsigned) n > n_rows && (unsigned) n <= max_rows) { + n_rows += scan_row(mvstack, cnt, ref, gmv, + &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1], + bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2, + &have_dummy_newmv_match, &have_row_mvs); + } + + if ((unsigned) n > n_cols && (unsigned) n <= max_cols) { + n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5], + bh4, h4, (bx4 - n * 2 + 1) | 1, + 1 + max_cols - n, bh4 >= 16 ? 
4 : 2, + &have_dummy_newmv_match, &have_col_mvs); + } + } + assert(*cnt <= 8); + + const int ref_match_count = have_col_mvs + have_row_mvs; + + // context build-up + int refmv_ctx, newmv_ctx; + switch (nearest_match) { + case 0: + refmv_ctx = imin(2, ref_match_count); + newmv_ctx = ref_match_count > 0; + break; + case 1: + refmv_ctx = imin(ref_match_count * 3, 4); + newmv_ctx = 3 - have_newmv; + break; + case 2: + refmv_ctx = 5; + newmv_ctx = 5 - have_newmv; + break; + } + + // sorting (nearest, then "secondary") + int len = nearest_cnt; + while (len) { + int last = 0; + for (int n = 1; n < len; n++) { + if (mvstack[n - 1].weight < mvstack[n].weight) { +#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0) + EXCHANGE(mvstack[n - 1], mvstack[n]); + last = n; + } + } + len = last; + } + len = *cnt; + while (len > nearest_cnt) { + int last = nearest_cnt; + for (int n = nearest_cnt + 1; n < len; n++) { + if (mvstack[n - 1].weight < mvstack[n].weight) { + EXCHANGE(mvstack[n - 1], mvstack[n]); +#undef EXCHANGE + last = n; + } + } + len = last; + } + + if (ref.ref[1] > 0) { + if (*cnt < 2) { + const int sign0 = rf->sign_bias[ref.ref[0] - 1]; + const int sign1 = rf->sign_bias[ref.ref[1] - 1]; + const int sz4 = imin(w4, h4); + refmvs_candidate *const same = &mvstack[*cnt]; + int same_count[4] = { 0 }; + + // non-self references in top + if (n_rows != ~0U) for (int x = 0; x < sz4;) { + const refmvs_block *const cand_b = &b_top[x]; + add_compound_extended_candidate(same, same_count, cand_b, + sign0, sign1, ref, rf->sign_bias); + x += dav1d_block_dimensions[cand_b->bs][0]; + } + + // non-self references in left + if (n_cols != ~0U) for (int y = 0; y < sz4;) { + const refmvs_block *const cand_b = &b_left[y][bx4 - 1]; + add_compound_extended_candidate(same, same_count, cand_b, + sign0, sign1, ref, rf->sign_bias); + y += dav1d_block_dimensions[cand_b->bs][1]; + } + + refmvs_candidate *const diff = &same[2]; + const int *const diff_count = 
&same_count[2]; + + // merge together + for (int n = 0; n < 2; n++) { + int m = same_count[n]; + + if (m >= 2) continue; + + const int l = diff_count[n]; + if (l) { + same[m].mv.mv[n] = diff[0].mv.mv[n]; + if (++m == 2) continue; + if (l == 2) { + same[1].mv.mv[n] = diff[1].mv.mv[n]; + continue; + } + } + do { + same[m].mv.mv[n] = tgmv[n]; + } while (++m < 2); + } + + // if the first extended was the same as the non-extended one, + // then replace it with the second extended one + int n = *cnt; + if (n == 1 && mvstack[0].mv.n == same[0].mv.n) + mvstack[1].mv = mvstack[2].mv; + do { + mvstack[n].weight = 2; + } while (++n < 2); + *cnt = 2; + } + + // clamping + const int left = -(bx4 + bw4 + 4) * 4 * 8; + const int right = (rf->iw4 - bx4 + 4) * 4 * 8; + const int top = -(by4 + bh4 + 4) * 4 * 8; + const int bottom = (rf->ih4 - by4 + 4) * 4 * 8; + + const int n_refmvs = *cnt; + int n = 0; + do { + mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right); + mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom); + mvstack[n].mv.mv[1].x = iclip(mvstack[n].mv.mv[1].x, left, right); + mvstack[n].mv.mv[1].y = iclip(mvstack[n].mv.mv[1].y, top, bottom); + } while (++n < n_refmvs); + + switch (refmv_ctx >> 1) { + case 0: + *ctx = imin(newmv_ctx, 1); + break; + case 1: + *ctx = 1 + imin(newmv_ctx, 3); + break; + case 2: + *ctx = iclip(3 + newmv_ctx, 4, 7); + break; + } + + return; + } else if (*cnt < 2 && ref.ref[0] > 0) { + const int sign = rf->sign_bias[ref.ref[0] - 1]; + const int sz4 = imin(w4, h4); + + // non-self references in top + if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) { + const refmvs_block *const cand_b = &b_top[x]; + add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias); + x += dav1d_block_dimensions[cand_b->bs][0]; + } + + // non-self references in left + if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) { + const refmvs_block *const cand_b = &b_left[y][bx4 - 1]; + 
add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias); + y += dav1d_block_dimensions[cand_b->bs][1]; + } + } + assert(*cnt <= 8); + + // clamping + int n_refmvs = *cnt; + if (n_refmvs) { + const int left = -(bx4 + bw4 + 4) * 4 * 8; + const int right = (rf->iw4 - bx4 + 4) * 4 * 8; + const int top = -(by4 + bh4 + 4) * 4 * 8; + const int bottom = (rf->ih4 - by4 + 4) * 4 * 8; + + int n = 0; + do { + mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right); + mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom); + } while (++n < n_refmvs); + } + + for (int n = *cnt; n < 2; n++) + mvstack[n].mv.mv[0] = tgmv[0]; + + *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx; +} + +void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf, + const int tile_col_start4, const int tile_col_end4, + const int tile_row_start4, const int tile_row_end4, + const int sby, int tile_row_idx) +{ + if (rf->n_tile_threads == 1) tile_row_idx = 0; + rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx]; + refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx]; + const int sbsz = rf->sbsz; + const int off = (sbsz * sby) & 16; + for (int i = 0; i < sbsz; i++, r += rf->r_stride) + rt->r[off + 5 + i] = r; + rt->r[off + 0] = r; + r += rf->r_stride; + rt->r[off + 1] = NULL; + rt->r[off + 2] = r; + r += rf->r_stride; + rt->r[off + 3] = NULL; + rt->r[off + 4] = r; + if (sby & 1) { +#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0) + EXCHANGE(rt->r[off + 0], rt->r[off + sbsz + 0]); + EXCHANGE(rt->r[off + 2], rt->r[off + sbsz + 2]); + EXCHANGE(rt->r[off + 4], rt->r[off + sbsz + 4]); +#undef EXCHANGE + } + + rt->rf = rf; + rt->tile_row.start = tile_row_start4; + rt->tile_row.end = imin(tile_row_end4, rf->ih4); + rt->tile_col.start = tile_col_start4; + rt->tile_col.end = imin(tile_col_end4, rf->iw4); +} + +void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx, + const int 
col_start8, const int col_end8, + const int row_start8, int row_end8) +{ + if (rf->n_tile_threads == 1) tile_row_idx = 0; + assert(row_start8 >= 0); + assert((unsigned) (row_end8 - row_start8) <= 16U); + row_end8 = imin(row_end8, rf->ih8); + const int col_start8i = imax(col_start8 - 8, 0); + const int col_end8i = imin(col_end8 + 8, rf->iw8); + + const ptrdiff_t stride = rf->rp_stride; + refmvs_temporal_block *rp_proj = + &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride]; + for (int y = row_start8; y < row_end8; y++) { + for (int x = col_start8; x < col_end8; x++) + rp_proj[x].mv.n = INVALID_MV; + rp_proj += stride; + } + + rp_proj = &rf->rp_proj[16 * stride * tile_row_idx]; + for (int n = 0; n < rf->n_mfmvs; n++) { + const int ref2cur = rf->mfmv_ref2cur[n]; + if (ref2cur == INT_MIN) continue; + + const int ref = rf->mfmv_ref[n]; + const int ref_sign = ref - 4; + const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride]; + for (int y = row_start8; y < row_end8; y++) { + const int y_sb_align = y & ~7; + const int y_proj_start = imax(y_sb_align, row_start8); + const int y_proj_end = imin(y_sb_align + 8, row_end8); + for (int x = col_start8i; x < col_end8i; x++) { + const refmvs_temporal_block *rb = &r[x]; + const int b_ref = rb->ref; + if (!b_ref) continue; + const int ref2ref = rf->mfmv_ref2ref[n][b_ref - 1]; + if (!ref2ref) continue; + const mv b_mv = rb->mv; + const mv offset = mv_projection(b_mv, ref2cur, ref2ref); + int pos_x = x + apply_sign(abs(offset.x) >> 6, + offset.x ^ ref_sign); + const int pos_y = y + apply_sign(abs(offset.y) >> 6, + offset.y ^ ref_sign); + if (pos_y >= y_proj_start && pos_y < y_proj_end) { + const ptrdiff_t pos = (pos_y & 15) * stride; + for (;;) { + const int x_sb_align = x & ~7; + if (pos_x >= imax(x_sb_align - 8, col_start8) && + pos_x < imin(x_sb_align + 16, col_end8)) + { + rp_proj[pos + pos_x].mv = rb->mv; + rp_proj[pos + pos_x].ref = ref2ref; + } + if (++x >= col_end8i) break; + rb++; + if (rb->ref 
!= b_ref || rb->mv.n != b_mv.n) break; + pos_x++; + } + } else { + for (;;) { + if (++x >= col_end8i) break; + rb++; + if (rb->ref != b_ref || rb->mv.n != b_mv.n) break; + } + } + x--; + } + r += stride; + } + } +} + +void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt, + const int col_start8, int col_end8, + const int row_start8, int row_end8) +{ + const refmvs_frame *const rf = rt->rf; + + assert(row_start8 >= 0); + assert((unsigned) (row_end8 - row_start8) <= 16U); + row_end8 = imin(row_end8, rf->ih8); + col_end8 = imin(col_end8, rf->iw8); + + const ptrdiff_t stride = rf->rp_stride; + const uint8_t *const ref_sign = rf->mfmv_sign; + refmvs_temporal_block *rp = &rf->rp[row_start8 * stride]; + for (int y = row_start8; y < row_end8; y++) { + const refmvs_block *const b = rt->r[6 + (y & 15) * 2]; + + for (int x = col_start8; x < col_end8;) { + const refmvs_block *const cand_b = &b[x * 2 + 1]; + const int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1; + + if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] && + (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096) + { + for (int n = 0; n < bw8; n++, x++) + rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[1], + .ref = cand_b->ref.ref[1] }; + } else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] && + (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096) + { + for (int n = 0; n < bw8; n++, x++) + rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0], + .ref = cand_b->ref.ref[0] }; + } else { + for (int n = 0; n < bw8; n++, x++) + rp[x].ref = 0; // "invalid" + } + } + rp += stride; + } +} + +int dav1d_refmvs_init_frame(refmvs_frame *const rf, + const Dav1dSequenceHeader *const seq_hdr, + const Dav1dFrameHeader *const frm_hdr, + const unsigned ref_poc[7], + refmvs_temporal_block *const rp, + const unsigned ref_ref_poc[7][7], + /*const*/ refmvs_temporal_block *const rp_ref[7], + const int n_tile_threads) +{ + rf->sbsz = 16 << seq_hdr->sb128; + rf->frm_hdr = 
frm_hdr; + rf->iw8 = (frm_hdr->width[0] + 7) >> 3; + rf->ih8 = (frm_hdr->height + 7) >> 3; + rf->iw4 = rf->iw8 << 1; + rf->ih4 = rf->ih8 << 1; + + const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2; + const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1; + if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { + if (rf->r) free(rf->r); + rf->r = malloc(sizeof(*rf->r) * 35 * r_stride * n_tile_rows); + if (!rf->r) return DAV1D_ERR(ENOMEM); + rf->r_stride = r_stride; + } + + const ptrdiff_t rp_stride = r_stride >> 1; + if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) { + if (rf->rp_proj) free(rf->rp_proj); + rf->rp_proj = malloc(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows); + if (!rf->rp_proj) return DAV1D_ERR(ENOMEM); + rf->rp_stride = rp_stride; + } + rf->n_tile_rows = n_tile_rows; + rf->n_tile_threads = n_tile_threads; + rf->rp = rp; + rf->rp_ref = rp_ref; + const unsigned poc = frm_hdr->frame_offset; + for (int i = 0; i < 7; i++) { + const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits, + ref_poc[i], poc); + rf->sign_bias[i] = poc_diff > 0; + rf->mfmv_sign[i] = poc_diff < 0; + rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits, + poc, ref_poc[i]), -31, 31); + } + + // temporal MV setup + rf->n_mfmvs = 0; + if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) { + int total = 2; + if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) { + rf->mfmv_ref[rf->n_mfmvs++] = 0; // last + total = 3; + } + if (rp_ref[4] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[4], + frm_hdr->frame_offset) > 0) + { + rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd + } + if (rp_ref[5] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[5], + frm_hdr->frame_offset) > 0) + { + rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2 + } + if (rf->n_mfmvs < total && rp_ref[6] && + get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[6], + frm_hdr->frame_offset) > 0) + { + 
rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref + } + if (rf->n_mfmvs < total && rp_ref[1]) + rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2 + + for (int n = 0; n < rf->n_mfmvs; n++) { + const unsigned rpoc = ref_poc[rf->mfmv_ref[n]]; + const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits, + rpoc, frm_hdr->frame_offset); + if (abs(diff1) > 31) { + rf->mfmv_ref2cur[n] = INT_MIN; + } else { + rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1; + for (int m = 0; m < 7; m++) { + const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m]; + const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits, + rpoc, rrpoc); + // unsigned comparison also catches the < 0 case + rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2; + } + } + } + } + rf->use_ref_frame_mvs = rf->n_mfmvs > 0; + + return 0; +} + +void dav1d_refmvs_init(refmvs_frame *const rf) { + rf->r = NULL; + rf->r_stride = 0; + rf->rp_proj = NULL; + rf->rp_stride = 0; +} + +void dav1d_refmvs_clear(refmvs_frame *const rf) { + if (rf->r) free(rf->r); + if (rf->rp_proj) free(rf->rp_proj); +} diff --git a/third_party/dav1d/src/refmvs.h b/third_party/dav1d/src/refmvs.h new file mode 100644 index 0000000000..6f68a76757 --- /dev/null +++ b/third_party/dav1d/src/refmvs.h @@ -0,0 +1,233 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2020, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_REF_MVS_H +#define DAV1D_SRC_REF_MVS_H + +#include + +#include "dav1d/headers.h" + +#include "common/intops.h" + +#include "src/intra_edge.h" +#include "src/levels.h" +#include "src/tables.h" + +#define INVALID_MV 0x80008000 + +typedef struct refmvs_temporal_block { + mv mv; + int8_t ref; +} refmvs_temporal_block; + +typedef union refmvs_refpair { + int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0 + uint16_t pair; +} refmvs_refpair; + +typedef union refmvs_mvpair { + mv mv[2]; + uint64_t n; +} refmvs_mvpair; + +typedef struct refmvs_block { + refmvs_mvpair mv; + refmvs_refpair ref; + uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv +} refmvs_block; + +typedef struct refmvs_frame { + const Dav1dFrameHeader *frm_hdr; + int iw4, ih4, iw8, ih8; + int sbsz; + int use_ref_frame_mvs; + uint8_t sign_bias[7], mfmv_sign[7]; + int8_t pocdiff[7]; + uint8_t mfmv_ref[3]; + int mfmv_ref2cur[3]; + int mfmv_ref2ref[3][7]; + int n_mfmvs; + + refmvs_temporal_block *rp; + /*const*/ refmvs_temporal_block *const *rp_ref; + refmvs_temporal_block *rp_proj; + ptrdiff_t rp_stride; + + refmvs_block *r; // 35 x r_stride memory + ptrdiff_t r_stride; + int n_tile_rows, 
n_tile_threads; +} refmvs_frame; + +typedef struct refmvs_tile { + const refmvs_frame *rf; + refmvs_block *r[32 + 5]; + refmvs_temporal_block *rp_proj; + struct { + int start, end; + } tile_col, tile_row; +} refmvs_tile; + +typedef struct refmvs_candidate { + refmvs_mvpair mv; + int weight; +} refmvs_candidate; + +// call once per frame thread +void dav1d_refmvs_init(refmvs_frame *rf); +void dav1d_refmvs_clear(refmvs_frame *rf); + +// call once per frame +int dav1d_refmvs_init_frame(refmvs_frame *rf, + const Dav1dSequenceHeader *seq_hdr, + const Dav1dFrameHeader *frm_hdr, + const unsigned ref_poc[7], + refmvs_temporal_block *rp, + const unsigned ref_ref_poc[7][7], + /*const*/ refmvs_temporal_block *const rp_ref[7], + int n_tile_threads); + +// initialize temporal MVs; this can be done in any configuration, e.g. one +// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or +// it can just be for the whole frame's sbrow, where col_{start,end}8 are the +// frame boundaries. row_{start,end}8 are the superblock row boundaries. 
+void dav1d_refmvs_load_tmvs(const refmvs_frame *rf, int tile_row_idx, + int col_start8, int col_end8, + int row_start8, int row_end8); + +// cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors +// into buffers for use in future frame's temporal MV prediction +void dav1d_refmvs_save_tmvs(const refmvs_tile *rt, + int col_start8, int col_end8, + int row_start8, int row_end8); + +// initialize tile boundaries and refmvs_block pointers for one tile/sbrow +void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf, + int tile_col_start4, int tile_col_end4, + int tile_row_start4, int tile_row_end4, + int sby, int tile_row_idx); + +// call for each block +void dav1d_refmvs_find(const refmvs_tile *rt, + refmvs_candidate mvstack[8], int *cnt, + int *ctx, const refmvs_refpair ref, enum BlockSize bs, + enum EdgeFlags edge_flags, int by4, int bx4); + +static inline void splat_oneref_mv(refmvs_tile *const rt, + const int by4, const int bx4, + const enum BlockSize bs, + const enum InterPredMode mode, + const int ref, const mv mv, + const int is_interintra) +{ + const int bw4 = dav1d_block_dimensions[bs][0]; + int bh4 = dav1d_block_dimensions[bs][1]; + refmvs_block **rr = &rt->r[(by4 & 31) + 5]; + + const refmvs_block tmpl = (refmvs_block) { + .ref.ref = { ref + 1, is_interintra ? 
0 : -1 }, + .mv.mv[0] = mv, + .bs = bs, + .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2), + }; + do { + refmvs_block *r = *rr++ + bx4; + for (int x = 0; x < bw4; x++) + r[x] = tmpl; + } while (--bh4); +} + +static inline void splat_intrabc_mv(refmvs_tile *const rt, + const int by4, const int bx4, + const enum BlockSize bs, const mv mv) +{ + const int bw4 = dav1d_block_dimensions[bs][0]; + int bh4 = dav1d_block_dimensions[bs][1]; + refmvs_block **rr = &rt->r[(by4 & 31) + 5]; + + const refmvs_block tmpl = (refmvs_block) { + .ref.ref = { 0, -1 }, + .mv.mv[0] = mv, + .bs = bs, + .mf = 0, + }; + do { + refmvs_block *r = *rr++ + bx4; + for (int x = 0; x < bw4; x++) { + r[x] = tmpl; + } + } while (--bh4); +} + +static inline void splat_tworef_mv(refmvs_tile *const rt, + const int by4, const int bx4, + const enum BlockSize bs, + const enum CompInterPredMode mode, + const refmvs_refpair ref, + const refmvs_mvpair mv) +{ + const int bw4 = dav1d_block_dimensions[bs][0]; + int bh4 = dav1d_block_dimensions[bs][1]; + refmvs_block **rr = &rt->r[(by4 & 31) + 5]; + + assert(bw4 >= 2 && bh4 >= 2); + const refmvs_block tmpl = (refmvs_block) { + .ref.pair = ref.pair + 0x0101, + .mv = mv, + .bs = bs, + .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2, + }; + do { + refmvs_block *r = *rr++ + bx4; + for (int x = 0; x < bw4; x++) + r[x] = tmpl; + } while (--bh4); +} + +static inline void splat_intraref(refmvs_tile *const rt, + const int by4, const int bx4, + const enum BlockSize bs) +{ + const int bw4 = dav1d_block_dimensions[bs][0]; + int bh4 = dav1d_block_dimensions[bs][1]; + refmvs_block **rr = &rt->r[(by4 & 31) + 5]; + + const refmvs_block tmpl = (refmvs_block) { + .ref.ref = { 0, -1 }, + .mv.mv[0].n = INVALID_MV, + .bs = bs, + .mf = 0, + }; + do { + refmvs_block *r = *rr++ + bx4; + for (int x = 0; x < bw4; x++) { + r[x] = tmpl; + } + } while (--bh4); +} + +#endif /* DAV1D_SRC_REF_MVS_H */ diff --git a/third_party/dav1d/src/scan.c 
b/third_party/dav1d/src/scan.c new file mode 100644 index 0000000000..c51c6f55ef --- /dev/null +++ b/third_party/dav1d/src/scan.c @@ -0,0 +1,444 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "common/attributes.h" +#include "src/scan.h" + +static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = { + 0, 4, 1, 2, + 5, 8, 12, 9, + 6, 3, 7, 10, + 13, 14, 11, 15, +}; +static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = { + 0, 4, 8, 12, + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15, +}; +static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = { + 0, 8, 1, 16, + 9, 2, 24, 17, + 10, 3, 25, 18, + 11, 4, 26, 19, + 12, 5, 27, 20, + 13, 6, 28, 21, + 14, 7, 29, 22, + 15, 30, 23, 31, +}; +static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = { + 0, 8, 16, 24, + 1, 9, 17, 25, + 2, 10, 18, 26, + 3, 11, 19, 27, + 4, 12, 20, 28, + 5, 13, 21, 29, + 6, 14, 22, 30, + 7, 15, 23, 31, +}; +static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = { + 0, 16, 1, 32, + 17, 2, 48, 33, + 18, 3, 49, 34, + 19, 4, 50, 35, + 20, 5, 51, 36, + 21, 6, 52, 37, + 22, 7, 53, 38, + 23, 8, 54, 39, + 24, 9, 55, 40, + 25, 10, 56, 41, + 26, 11, 57, 42, + 27, 12, 58, 43, + 28, 13, 59, 44, + 29, 14, 60, 45, + 30, 15, 61, 46, + 31, 62, 47, 63, +}; +static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = { + 0, 16, 32, 48, + 1, 17, 33, 49, + 2, 18, 34, 50, + 3, 19, 35, 51, + 4, 20, 36, 52, + 5, 21, 37, 53, + 6, 22, 38, 54, + 7, 23, 39, 55, + 8, 24, 40, 56, + 9, 25, 41, 57, + 10, 26, 42, 58, + 11, 27, 43, 59, + 12, 28, 44, 60, + 13, 29, 45, 61, + 14, 30, 46, 62, + 15, 31, 47, 63, +}; +static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = { + 0, 1, 4, 2, 5, 8, 3, 6, + 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, + 25, 28, 23, 26, 29, 27, 30, 31, +}; +static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = { + 0, 4, 8, 12, 16, 20, 24, 28, + 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, + 3, 7, 11, 15, 19, 23, 27, 31, +}; +static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = { + 0, 8, 1, 2, 9, 16, 24, 17, + 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, + 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 
7, 15, 22, 29, 36, + 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, + 46, 39, 47, 54, 61, 62, 55, 63, +}; +static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = { + 0, 8, 16, 24, 32, 40, 48, 56, + 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, + 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, + 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, + 7, 15, 23, 31, 39, 47, 55, 63, +}; +static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = { + 0, 16, 1, 32, 17, 2, 48, 33, + 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, + 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, + 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, + 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, + 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, + 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, + 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, + 94, 79, 125, 110, 95, 126, 111, 127, +}; +static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = { + 0, 16, 32, 48, 64, 80, 96, 112, + 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, + 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, + 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, + 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, + 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, + 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, + 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, + 15, 31, 47, 63, 79, 95, 111, 127, +}; +static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = { + 0, 32, 1, 64, 33, 2, 96, 65, + 34, 3, 128, 97, 66, 35, 4, 160, + 129, 98, 67, 36, 5, 192, 161, 130, + 99, 68, 37, 6, 224, 193, 162, 131, + 100, 69, 38, 7, 225, 194, 163, 132, + 101, 70, 39, 8, 226, 195, 164, 133, + 102, 71, 40, 9, 
227, 196, 165, 134, + 103, 72, 41, 10, 228, 197, 166, 135, + 104, 73, 42, 11, 229, 198, 167, 136, + 105, 74, 43, 12, 230, 199, 168, 137, + 106, 75, 44, 13, 231, 200, 169, 138, + 107, 76, 45, 14, 232, 201, 170, 139, + 108, 77, 46, 15, 233, 202, 171, 140, + 109, 78, 47, 16, 234, 203, 172, 141, + 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, + 112, 81, 50, 19, 237, 206, 175, 144, + 113, 82, 51, 20, 238, 207, 176, 145, + 114, 83, 52, 21, 239, 208, 177, 146, + 115, 84, 53, 22, 240, 209, 178, 147, + 116, 85, 54, 23, 241, 210, 179, 148, + 117, 86, 55, 24, 242, 211, 180, 149, + 118, 87, 56, 25, 243, 212, 181, 150, + 119, 88, 57, 26, 244, 213, 182, 151, + 120, 89, 58, 27, 245, 214, 183, 152, + 121, 90, 59, 28, 246, 215, 184, 153, + 122, 91, 60, 29, 247, 216, 185, 154, + 123, 92, 61, 30, 248, 217, 186, 155, + 124, 93, 62, 31, 249, 218, 187, 156, + 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, + 190, 159, 253, 222, 191, 254, 223, 255, +}; +static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, +}; +static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; +static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, + 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, + 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, + 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, + 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 
89, 96, 55, 62, 69, 76, + 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, + 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115, + 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127, +}; +static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; +static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = { + 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80, + 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67, + 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114, + 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, 85, 100, 115, 130, + 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, 12, 27, + 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148, + 133, 118, 103, 88, 73, 58, 43, 28, 13, 14, 29, 44, 59, 74, 89, 104, + 119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135, + 120, 105, 90, 75, 60, 45, 30, 15, 31, 46, 61, 76, 91, 106, 121, 136, + 151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, + 107, 92, 77, 62, 47, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, + 228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, + 125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, + 141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, + 188, 173, 158, 143, 159, 174, 
189, 204, 219, 234, 249, 250, 235, 220, 205, 190, + 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255, +}; +static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; +static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 
126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, +}; +static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, + 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, + 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, + 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, + 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, + 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, 323, 292, + 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, + 231, 200, 169, 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, + 232, 201, 170, 139, 108, 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, + 233, 202, 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, + 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, + 236, 205, 174, 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, + 237, 206, 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, 271, + 240, 209, 178, 147, 116, 85, 54, 
23, 489, 458, 427, 396, 365, 334, 303, 272, + 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, + 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, 305, 274, + 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, 399, 368, 337, 306, 275, + 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, 431, 400, 369, 338, 307, 276, + 245, 214, 183, 152, 121, 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277, + 246, 215, 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, 278, + 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, 372, 341, 310, 279, + 248, 217, 186, 155, 124, 93, 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, + 249, 218, 187, 156, 125, 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, + 219, 188, 157, 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, + 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, + 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, + 285, 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, + 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, +}; +static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, + 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, + 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, + 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124, + 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 
142, 149, 156, + 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, + 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, + 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255, +}; +static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, + 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, + 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, + 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120, + 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, + 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, + 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, + 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248, + 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280, + 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 
311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312, + 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344, + 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376, + 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, + 438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, + 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511, +}; +static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = { + 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131, + 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, + 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292, + 261, 230, 199, 168, 137, 106, 75, 44, 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263, + 232, 201, 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, 451, 420, 389, 358, + 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 
514, 545, 576, 608, 577, + 546, 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423, + 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 22, 53, 84, + 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, + 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 520, 551, 582, 613, + 644, 675, 706, 737, 768, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 26, + 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 864, 833, 802, 771, 740, 709, + 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, + 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, 711, 680, 649, 618, 587, 556, + 525, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, + 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, + 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, + 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, 591, 560, 529, 498, + 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 127, 
158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, + 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, + 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966, + 997, 998, 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, 348, 379, + 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, + 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908, + 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, 383, 414, 445, 476, 507, 538, 569, + 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, + 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, 851, 820, 789, 758, 727, 696, + 665, 634, 603, 572, 541, 510, 479, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791, + 760, 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, 886, 855, 824, 793, 762, + 731, 700, 669, 638, 607, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734, + 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, 828, 797, 766, 735, 767, 798, 829, 
860, 891, 922, 953, 984, 1015, 1016, 985, 954, 923,
+    892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
+};
+// Scan tables indexed [transform size][transform class]; slots without an initializer below are implicitly NULL.
+const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
+    [TX_4X4] = {
+        [TX_CLASS_2D] = av1_default_scan_4x4,
+        [TX_CLASS_V]  = av1_mrow_scan_4x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16, // identity order 0..255; every TX_CLASS_H slot shares this table
+    }, [TX_8X8] = {
+        [TX_CLASS_2D] = av1_default_scan_8x8,
+        [TX_CLASS_V]  = av1_mrow_scan_8x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [TX_16X16] = {
+        [TX_CLASS_2D] = av1_default_scan_16x16,
+        [TX_CLASS_V]  = av1_mrow_scan_16x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [TX_32X32] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [TX_64X64] = { // sizes with a 64-long side point at the matching 32-long table
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_4X8] = {
+        [TX_CLASS_2D] = av1_default_scan_4x8,
+        [TX_CLASS_V]  = av1_mrow_scan_4x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X4] = {
+        [TX_CLASS_2D] = av1_default_scan_8x4,
+        [TX_CLASS_V]  = av1_mrow_scan_8x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X16] = {
+        [TX_CLASS_2D] = av1_default_scan_8x16,
+        [TX_CLASS_V]  = av1_mrow_scan_8x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X8] = {
+        [TX_CLASS_2D] = av1_default_scan_16x8,
+        [TX_CLASS_V]  = av1_mrow_scan_16x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X32] = {
+        [TX_CLASS_2D] = av1_default_scan_16x32,
+    }, [RTX_32X16] = {
+        [TX_CLASS_2D] = av1_default_scan_32x16,
+    }, [RTX_32X64] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_64X32] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_4X16] = {
+        [TX_CLASS_2D] = av1_default_scan_4x16,
+        [TX_CLASS_V]  = av1_mrow_scan_4x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X4] = {
+        [TX_CLASS_2D] = av1_default_scan_16x4,
+        [TX_CLASS_V]  = av1_mrow_scan_16x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X32] = {
+        [TX_CLASS_2D] = av1_default_scan_8x32,
+    }, [RTX_32X8] = {
+        [TX_CLASS_2D] = av1_default_scan_32x8,
+    }, [RTX_16X64] = {
+        [TX_CLASS_2D] = av1_default_scan_16x32,
+    }, [RTX_64X16] = {
+        [TX_CLASS_2D] = av1_default_scan_32x16,
+    },
+};
diff --git a/third_party/dav1d/src/scan.h b/third_party/dav1d/src/scan.h
new file mode 100644
index 0000000000..c474b7f055
--- /dev/null
+++ b/third_party/dav1d/src/scan.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef DAV1D_SRC_SCAN_H +#define DAV1D_SRC_SCAN_H + +#include <stdint.h> + +#include "src/levels.h" + +extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3]; /* scan table pointers, indexed [transform size][TX_CLASS_2D / TX_CLASS_V / TX_CLASS_H] */ + +#endif /* DAV1D_SRC_SCAN_H */ diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c new file mode 100644 index 0000000000..840b409518 --- /dev/null +++ b/third_party/dav1d/src/tables.c @@ -0,0 +1,1022 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <stdint.h> + +#include "common/attributes.h" + +#include "src/levels.h" +#include "src/tables.h" + +const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS] = { + { + // partitions: + // none, h, v, split, tts, tbs, tls, trs, h4, v4 + { 0x00, 0x00, 0x10, -1, 0x00, 0x10, 0x10, 0x10, -1, -1 }, // bl128 + { 0x10, 0x10, 0x18, -1, 0x10, 0x18, 0x18, 0x18, 0x10, 0x1c }, // bl64 + { 0x18, 0x18, 0x1c, -1, 0x18, 0x1c, 0x1c, 0x1c, 0x18, 0x1e }, // bl32 + { 0x1c, 0x1c, 0x1e, -1, 0x1c, 0x1e, 0x1e, 0x1e, 0x1c, 0x1f }, // bl16 + { 0x1e, 0x1e, 0x1f, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8 + }, { + { 0x00, 0x10, 0x00, -1, 0x10, 0x10, 0x00, 0x10, -1, -1 }, // bl128 + { 0x10, 0x18, 0x10, -1, 0x18, 0x18, 0x10, 0x18, 0x1c, 0x10 }, // bl64 + { 0x18, 0x1c, 0x18, -1, 0x1c, 0x1c, 0x18, 0x1c, 0x1e, 0x18 }, // bl32 + { 0x1c, 0x1e, 0x1c, -1, 0x1e, 0x1e, 0x1c, 0x1e, 0x1f, 0x1c }, // bl16 + { 0x1e, 0x1f, 0x1e, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8 + } +}; + +const uint8_t /* enum BlockSize */ + dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2] = +{ + [BL_128X128] = { + [PARTITION_NONE] = { BS_128x128 }, + [PARTITION_H] = { BS_128x64 }, + [PARTITION_V] = { BS_64x128 }, + [PARTITION_T_TOP_SPLIT] = { BS_64x64, BS_128x64 }, + [PARTITION_T_BOTTOM_SPLIT] = { BS_128x64, BS_64x64 }, + [PARTITION_T_LEFT_SPLIT] = { BS_64x64, BS_64x128 }, + [PARTITION_T_RIGHT_SPLIT] = { BS_64x128, BS_64x64 }, + }, [BL_64X64] = { + [PARTITION_NONE] = { BS_64x64 }, + [PARTITION_H] = { BS_64x32 }, + [PARTITION_V] = { BS_32x64 }, + [PARTITION_T_TOP_SPLIT] = { BS_32x32, BS_64x32 }, + [PARTITION_T_BOTTOM_SPLIT] = { BS_64x32, BS_32x32 }, + [PARTITION_T_LEFT_SPLIT] = { BS_32x32, BS_32x64 }, + [PARTITION_T_RIGHT_SPLIT] = { BS_32x64, BS_32x32 }, + [PARTITION_H4] = { BS_64x16 }, + [PARTITION_V4] = { BS_16x64 }, + }, [BL_32X32] = { + [PARTITION_NONE] = { BS_32x32 }, + [PARTITION_H] = { BS_32x16 }, + [PARTITION_V] = { BS_16x32 }, + [PARTITION_T_TOP_SPLIT] = { BS_16x16, BS_32x16 }, + 
[PARTITION_T_BOTTOM_SPLIT] = { BS_32x16, BS_16x16 }, + [PARTITION_T_LEFT_SPLIT] = { BS_16x16, BS_16x32 }, + [PARTITION_T_RIGHT_SPLIT] = { BS_16x32, BS_16x16 }, + [PARTITION_H4] = { BS_32x8 }, + [PARTITION_V4] = { BS_8x32 }, + }, [BL_16X16] = { + [PARTITION_NONE] = { BS_16x16 }, + [PARTITION_H] = { BS_16x8 }, + [PARTITION_V] = { BS_8x16 }, + [PARTITION_T_TOP_SPLIT] = { BS_8x8, BS_16x8 }, + [PARTITION_T_BOTTOM_SPLIT] = { BS_16x8, BS_8x8 }, + [PARTITION_T_LEFT_SPLIT] = { BS_8x8, BS_8x16 }, + [PARTITION_T_RIGHT_SPLIT] = { BS_8x16, BS_8x8 }, + [PARTITION_H4] = { BS_16x4 }, + [PARTITION_V4] = { BS_4x16 }, + }, [BL_8X8] = { + [PARTITION_NONE] = { BS_8x8 }, + [PARTITION_H] = { BS_8x4 }, + [PARTITION_V] = { BS_4x8 }, + [PARTITION_SPLIT] = { BS_4x4 }, + } +}; + +const uint8_t dav1d_block_dimensions[N_BS_SIZES][4] = { + [BS_128x128] = { 32, 32, 5, 5 }, + [BS_128x64] = { 32, 16, 5, 4 }, + [BS_64x128] = { 16, 32, 4, 5 }, + [BS_64x64] = { 16, 16, 4, 4 }, + [BS_64x32] = { 16, 8, 4, 3 }, + [BS_64x16] = { 16, 4, 4, 2 }, + [BS_32x64] = { 8, 16, 3, 4 }, + [BS_32x32] = { 8, 8, 3, 3 }, + [BS_32x16] = { 8, 4, 3, 2 }, + [BS_32x8] = { 8, 2, 3, 1 }, + [BS_16x64] = { 4, 16, 2, 4 }, + [BS_16x32] = { 4, 8, 2, 3 }, + [BS_16x16] = { 4, 4, 2, 2 }, + [BS_16x8] = { 4, 2, 2, 1 }, + [BS_16x4] = { 4, 1, 2, 0 }, + [BS_8x32] = { 2, 8, 1, 3 }, + [BS_8x16] = { 2, 4, 1, 2 }, + [BS_8x8] = { 2, 2, 1, 1 }, + [BS_8x4] = { 2, 1, 1, 0 }, + [BS_4x16] = { 1, 4, 0, 2 }, + [BS_4x8] = { 1, 2, 0, 1 }, + [BS_4x4] = { 1, 1, 0, 0 }, +}; + +const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES] = { + [ TX_4X4] = { .w = 1, .h = 1, .lw = 0, .lh = 0, + .min = 0, .max = 0, .ctx = 0 }, + [ TX_8X8] = { .w = 2, .h = 2, .lw = 1, .lh = 1, + .min = 1, .max = 1, .sub = TX_4X4, .ctx = 1 }, + [ TX_16X16] = { .w = 4, .h = 4, .lw = 2, .lh = 2, + .min = 2, .max = 2, .sub = TX_8X8, .ctx = 2 }, + [ TX_32X32] = { .w = 8, .h = 8, .lw = 3, .lh = 3, + .min = 3, .max = 3, .sub = TX_16X16, .ctx = 3 }, + [ TX_64X64] = { .w = 16, .h = 16, .lw = 4, 
.lh = 4, + .min = 4, .max = 4, .sub = TX_32X32, .ctx = 4 }, + [RTX_4X8] = { .w = 1, .h = 2, .lw = 0, .lh = 1, + .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 }, + [RTX_8X4] = { .w = 2, .h = 1, .lw = 1, .lh = 0, + .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 }, + [RTX_8X16] = { .w = 2, .h = 4, .lw = 1, .lh = 2, + .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 }, + [RTX_16X8] = { .w = 4, .h = 2, .lw = 2, .lh = 1, + .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 }, + [RTX_16X32] = { .w = 4, .h = 8, .lw = 2, .lh = 3, + .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 }, + [RTX_32X16] = { .w = 8, .h = 4, .lw = 3, .lh = 2, + .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 }, + [RTX_32X64] = { .w = 8, .h = 16, .lw = 3, .lh = 4, + .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 }, + [RTX_64X32] = { .w = 16, .h = 8, .lw = 4, .lh = 3, + .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 }, + [RTX_4X16] = { .w = 1, .h = 4, .lw = 0, .lh = 2, + .min = 0, .max = 2, .sub = RTX_4X8, .ctx = 1 }, + [RTX_16X4] = { .w = 4, .h = 1, .lw = 2, .lh = 0, + .min = 0, .max = 2, .sub = RTX_8X4, .ctx = 1 }, + [RTX_8X32] = { .w = 2, .h = 8, .lw = 1, .lh = 3, + .min = 1, .max = 3, .sub = RTX_8X16, .ctx = 2 }, + [RTX_32X8] = { .w = 8, .h = 2, .lw = 3, .lh = 1, + .min = 1, .max = 3, .sub = RTX_16X8, .ctx = 2 }, + [RTX_16X64] = { .w = 4, .h = 16, .lw = 2, .lh = 4, + .min = 2, .max = 4, .sub = RTX_16X32, .ctx = 3 }, + [RTX_64X16] = { .w = 16, .h = 4, .lw = 4, .lh = 2, + .min = 2, .max = 4, .sub = RTX_32X16, .ctx = 3 }, +}; + +const uint8_t /* enum (Rect)TxfmSize */ + dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */] = +{ + [BS_128x128] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 }, + [BS_128x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 }, + [BS_64x128] = { TX_64X64, TX_32X32, 0, TX_32X32 }, + [BS_64x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 }, + [BS_64x32] = { RTX_64X32, RTX_32X16, TX_32X32, TX_32X32 }, + [BS_64x16] = { RTX_64X16, RTX_32X8, RTX_32X16, RTX_32X16 }, + [BS_32x64] = { RTX_32X64, 
RTX_16X32, 0, TX_32X32 }, + [BS_32x32] = { TX_32X32, TX_16X16, RTX_16X32, TX_32X32 }, + [BS_32x16] = { RTX_32X16, RTX_16X8, TX_16X16, RTX_32X16 }, + [BS_32x8] = { RTX_32X8, RTX_16X4, RTX_16X8, RTX_32X8 }, + [BS_16x64] = { RTX_16X64, RTX_8X32, 0, RTX_16X32 }, + [BS_16x32] = { RTX_16X32, RTX_8X16, 0, RTX_16X32 }, + [BS_16x16] = { TX_16X16, TX_8X8, RTX_8X16, TX_16X16 }, + [BS_16x8] = { RTX_16X8, RTX_8X4, TX_8X8, RTX_16X8 }, + [BS_16x4] = { RTX_16X4, RTX_8X4, RTX_8X4, RTX_16X4 }, + [BS_8x32] = { RTX_8X32, RTX_4X16, 0, RTX_8X32 }, + [BS_8x16] = { RTX_8X16, RTX_4X8, 0, RTX_8X16 }, + [BS_8x8] = { TX_8X8, TX_4X4, RTX_4X8, TX_8X8 }, + [BS_8x4] = { RTX_8X4, TX_4X4, TX_4X4, RTX_8X4 }, + [BS_4x16] = { RTX_4X16, RTX_4X8, 0, RTX_4X16 }, + [BS_4x8] = { RTX_4X8, TX_4X4, 0, RTX_4X8 }, + [BS_4x4] = { TX_4X4, TX_4X4, TX_4X4, TX_4X4 }, +}; + +const uint8_t /* enum TxfmType */ + dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES] = +{ + [DC_PRED] = DCT_DCT, + [VERT_PRED] = ADST_DCT, + [HOR_PRED] = DCT_ADST, + [DIAG_DOWN_LEFT_PRED] = DCT_DCT, + [DIAG_DOWN_RIGHT_PRED] = ADST_ADST, + [VERT_RIGHT_PRED] = ADST_DCT, + [HOR_DOWN_PRED] = DCT_ADST, + [HOR_UP_PRED] = DCT_ADST, + [VERT_LEFT_PRED] = ADST_DCT, + [SMOOTH_PRED] = ADST_ADST, + [SMOOTH_V_PRED] = ADST_DCT, + [SMOOTH_H_PRED] = DCT_ADST, + [PAETH_PRED] = ADST_ADST, +}; + +const uint8_t /* enum InterPredMode */ + dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2] = +{ + [NEARESTMV_NEARESTMV] = { NEARESTMV, NEARESTMV }, + [NEARMV_NEARMV] = { NEARMV, NEARMV }, + [NEWMV_NEWMV] = { NEWMV, NEWMV }, + [GLOBALMV_GLOBALMV] = { GLOBALMV, GLOBALMV }, + [NEWMV_NEARESTMV] = { NEWMV, NEARESTMV }, + [NEWMV_NEARMV] = { NEWMV, NEARMV }, + [NEARESTMV_NEWMV] = { NEARESTMV, NEWMV }, + [NEARMV_NEWMV] = { NEARMV, NEWMV }, +}; + +const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = { + [BL_128X128] = N_PARTITIONS - 3, + [BL_64X64] = N_PARTITIONS - 1, + [BL_32X32] = N_PARTITIONS - 1, + [BL_16X16] = N_PARTITIONS - 1, + [BL_8X8] = N_SUB8X8_PARTITIONS - 1, +}; + 
+const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = { + /* Intra2 */ + IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, + /* Intra1 */ + IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST, + /* Inter2 */ + IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, + DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST, + /* Inter1 */ + IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST, + DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST, + ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST, +}; + +const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = { + [BS_128x128] = 3, + [BS_128x64] = 3, + [BS_64x128] = 3, + [BS_64x64] = 3, + [BS_64x32] = 3, + [BS_64x16] = 2, + [BS_32x64] = 3, + [BS_32x32] = 3, + [BS_32x16] = 2, + [BS_32x8 ] = 1, + [BS_16x64] = 2, + [BS_16x32] = 2, + [BS_16x16] = 2, + [BS_16x8 ] = 1, + [BS_16x4 ] = 0, + [BS_8x32 ] = 1, + [BS_8x16 ] = 1, + [BS_8x8 ] = 1, + [BS_8x4 ] = 0, + [BS_4x16 ] = 0, + [BS_4x8 ] = 0, + [BS_4x4 ] = 0, +}; + +const uint8_t dav1d_lo_ctx_offsets[3][5][5] = { + { /* w == h */ + { 0, 1, 6, 6, 21 }, + { 1, 6, 6, 21, 21 }, + { 6, 6, 21, 21, 21 }, + { 6, 21, 21, 21, 21 }, + { 21, 21, 21, 21, 21 }, + }, { /* w > h */ + { 0, 16, 6, 6, 21 }, + { 16, 16, 6, 21, 21 }, + { 16, 16, 21, 21, 21 }, + { 16, 16, 21, 21, 21 }, + { 16, 16, 21, 21, 21 }, + }, { /* w < h */ + { 0, 11, 11, 11, 11 }, + { 11, 11, 11, 11, 11 }, + { 6, 6, 21, 21, 21 }, + { 6, 21, 21, 21, 21 }, + { 21, 21, 21, 21, 21 }, + }, +}; + +const uint8_t dav1d_skip_ctx[5][5] = { + { 1, 2, 2, 2, 3 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 }, +}; + +const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = { + [DCT_DCT] = TX_CLASS_2D, + [ADST_DCT] = TX_CLASS_2D, + [DCT_ADST] = TX_CLASS_2D, + [ADST_ADST] = TX_CLASS_2D, + [FLIPADST_DCT] = TX_CLASS_2D, + [DCT_FLIPADST] = TX_CLASS_2D, + [FLIPADST_FLIPADST] = TX_CLASS_2D, + [ADST_FLIPADST] = TX_CLASS_2D, + 
[FLIPADST_ADST] = TX_CLASS_2D, + [IDTX] = TX_CLASS_2D, + [V_DCT] = TX_CLASS_V, + [H_DCT] = TX_CLASS_H, + [V_ADST] = TX_CLASS_V, + [H_ADST] = TX_CLASS_H, + [V_FLIPADST] = TX_CLASS_V, + [H_FLIPADST] = TX_CLASS_H, + [WHT_WHT] = TX_CLASS_2D, +}; + +const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS][DAV1D_N_FILTERS] = { + [DAV1D_FILTER_8TAP_REGULAR] = { + [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR, + [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_REGULAR_SHARP, + [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_REGULAR_SMOOTH, + }, [DAV1D_FILTER_8TAP_SHARP] = { + [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR, + [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SHARP, + [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SHARP_SMOOTH, + }, [DAV1D_FILTER_8TAP_SMOOTH] = { + [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR, + [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SMOOTH_SHARP, + [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SMOOTH, + }, [DAV1D_FILTER_BILINEAR] = { + [DAV1D_FILTER_BILINEAR] = FILTER_2D_BILINEAR, + } +}; + +const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = { + [FILTER_2D_8TAP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR }, + [FILTER_2D_8TAP_REGULAR_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR }, + [FILTER_2D_8TAP_REGULAR_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR }, + [FILTER_2D_8TAP_SHARP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP }, + [FILTER_2D_8TAP_SHARP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP }, + [FILTER_2D_8TAP_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP }, + [FILTER_2D_8TAP_SMOOTH_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH }, + [FILTER_2D_8TAP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH }, + [FILTER_2D_8TAP_SMOOTH_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH }, + [FILTER_2D_BILINEAR] = { DAV1D_FILTER_BILINEAR, 
DAV1D_FILTER_BILINEAR }, +}; + +const uint8_t dav1d_filter_mode_to_y_mode[5] = { + DC_PRED, VERT_PRED, HOR_PRED, HOR_DOWN_PRED, DC_PRED +}; + +const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES] = { + [DC_PRED] = 0, + [VERT_PRED] = 1, + [HOR_PRED] = 2, + [DIAG_DOWN_LEFT_PRED] = 3, + [DIAG_DOWN_RIGHT_PRED] = 4, + [VERT_RIGHT_PRED] = 4, + [HOR_DOWN_PRED] = 4, + [HOR_UP_PRED] = 4, + [VERT_LEFT_PRED] = 3, + [SMOOTH_PRED] = 0, + [SMOOTH_V_PRED] = 1, + [SMOOTH_H_PRED] = 2, + [PAETH_PRED] = 0, +}; + +const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES] = { + [BS_32x32] = 6, + [BS_32x16] = 5, + [BS_32x8] = 8, + [BS_16x32] = 4, + [BS_16x16] = 3, + [BS_16x8] = 2, + [BS_8x32] = 7, + [BS_8x16] = 1, + [BS_8x8] = 0, +}; + +const Dav1dWarpedMotionParams dav1d_default_wm_params = { + .type = DAV1D_WM_TYPE_IDENTITY, + .matrix = { + 0, 0, 1 << 16, + 0, 0, 1 << 16, + }, + .u.p.alpha = 0, + .u.p.beta = 0, + .u.p.gamma = 0, + .u.p.delta = 0, +}; + +const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = { + { 1 * 12 + 0, 2 * 12 + 0 }, // 6 + { 1 * 12 + 0, 2 * 12 - 1 }, // 7 + { -1 * 12 + 1, -2 * 12 + 2 }, // 0 + { 0 * 12 + 1, -1 * 12 + 2 }, // 1 + { 0 * 12 + 1, 0 * 12 + 2 }, // 2 + { 0 * 12 + 1, 1 * 12 + 2 }, // 3 + { 1 * 12 + 1, 2 * 12 + 2 }, // 4 + { 1 * 12 + 0, 2 * 12 + 1 }, // 5 + { 1 * 12 + 0, 2 * 12 + 0 }, // 6 + { 1 * 12 + 0, 2 * 12 - 1 }, // 7 + { -1 * 12 + 1, -2 * 12 + 2 }, // 0 + { 0 * 12 + 1, -1 * 12 + 2 }, // 1 +}; + +const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1 + { 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 }, + { 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 }, + { 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 }, + { 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 }, + { 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 }, + { 2, 0, 22, -1 }, +}; + +const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = { + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 
10, 9, 9, 9, 9, + 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0 +}; + +const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = { + [DAV1D_FILTER_8TAP_REGULAR] = { + { 0, 1, -3, 63, 4, -1, 0, 0 }, + { 0, 1, -5, 61, 9, -2, 0, 0 }, + { 0, 1, -6, 58, 14, -4, 1, 0 }, + { 0, 1, -7, 55, 19, -5, 1, 0 }, + { 0, 1, -7, 51, 24, -6, 1, 0 }, + { 0, 1, -8, 47, 29, -6, 1, 0 }, + { 0, 1, -7, 42, 33, -6, 1, 0 }, + { 0, 1, -7, 38, 38, -7, 1, 0 }, + { 0, 1, -6, 33, 42, -7, 1, 0 }, + { 0, 1, -6, 29, 47, -8, 1, 0 }, + { 0, 1, -6, 24, 51, -7, 1, 0 }, + { 0, 1, -5, 19, 55, -7, 1, 0 }, + { 0, 1, -4, 14, 58, -6, 1, 0 }, + { 0, 0, -2, 9, 61, -5, 1, 0 }, + { 0, 0, -1, 4, 63, -3, 1, 0 } + }, [DAV1D_FILTER_8TAP_SMOOTH] = { + { 0, 1, 14, 31, 17, 1, 0, 0 }, + { 0, 0, 13, 31, 18, 2, 0, 0 }, + { 0, 0, 11, 31, 20, 2, 0, 0 }, + { 0, 0, 10, 30, 21, 3, 0, 0 }, + { 0, 0, 9, 29, 22, 4, 0, 0 }, + { 0, 0, 8, 28, 23, 5, 0, 0 }, + { 0, -1, 8, 27, 24, 6, 0, 0 }, + { 0, -1, 7, 26, 26, 7, -1, 0 }, + { 0, 0, 6, 24, 27, 8, -1, 0 }, + { 0, 0, 5, 23, 28, 8, 0, 0 }, + { 0, 0, 4, 22, 29, 9, 0, 0 }, + { 0, 0, 3, 21, 30, 10, 0, 0 }, + { 0, 0, 2, 20, 31, 11, 0, 0 }, + { 0, 0, 2, 18, 31, 13, 0, 0 }, + { 0, 0, 1, 17, 31, 14, 1, 0 } + }, [DAV1D_FILTER_8TAP_SHARP] = { + { -1, 1, -3, 63, 4, -1, 1, 0 }, + { -1, 3, -6, 62, 8, -3, 2, -1 }, + { -1, 4, -9, 60, 13, 
-5, 3, -1 }, + { -2, 5, -11, 58, 19, -7, 3, -1 }, + { -2, 5, -11, 54, 24, -9, 4, -1 }, + { -2, 5, -12, 50, 30, -10, 4, -1 }, + { -2, 5, -12, 45, 35, -11, 5, -1 }, + { -2, 6, -12, 40, 40, -12, 6, -2 }, + { -1, 5, -11, 35, 45, -12, 5, -2 }, + { -1, 4, -10, 30, 50, -12, 5, -2 }, + { -1, 4, -9, 24, 54, -11, 5, -2 }, + { -1, 3, -7, 19, 58, -11, 5, -2 }, + { -1, 3, -5, 13, 60, -9, 4, -1 }, + { -1, 2, -3, 8, 62, -6, 3, -1 }, + { 0, 1, -1, 4, 63, -3, 1, -1 } + /* width <= 4 */ + }, [3 + DAV1D_FILTER_8TAP_REGULAR] = { + { 0, 0, -2, 63, 4, -1, 0, 0 }, + { 0, 0, -4, 61, 9, -2, 0, 0 }, + { 0, 0, -5, 58, 14, -3, 0, 0 }, + { 0, 0, -6, 55, 19, -4, 0, 0 }, + { 0, 0, -6, 51, 24, -5, 0, 0 }, + { 0, 0, -7, 47, 29, -5, 0, 0 }, + { 0, 0, -6, 42, 33, -5, 0, 0 }, + { 0, 0, -6, 38, 38, -6, 0, 0 }, + { 0, 0, -5, 33, 42, -6, 0, 0 }, + { 0, 0, -5, 29, 47, -7, 0, 0 }, + { 0, 0, -5, 24, 51, -6, 0, 0 }, + { 0, 0, -4, 19, 55, -6, 0, 0 }, + { 0, 0, -3, 14, 58, -5, 0, 0 }, + { 0, 0, -2, 9, 61, -4, 0, 0 }, + { 0, 0, -1, 4, 63, -2, 0, 0 } + }, [3 + DAV1D_FILTER_8TAP_SMOOTH] = { + { 0, 0, 15, 31, 17, 1, 0, 0 }, + { 0, 0, 13, 31, 18, 2, 0, 0 }, + { 0, 0, 11, 31, 20, 2, 0, 0 }, + { 0, 0, 10, 30, 21, 3, 0, 0 }, + { 0, 0, 9, 29, 22, 4, 0, 0 }, + { 0, 0, 8, 28, 23, 5, 0, 0 }, + { 0, 0, 7, 27, 24, 6, 0, 0 }, + { 0, 0, 6, 26, 26, 6, 0, 0 }, + { 0, 0, 6, 24, 27, 7, 0, 0 }, + { 0, 0, 5, 23, 28, 8, 0, 0 }, + { 0, 0, 4, 22, 29, 9, 0, 0 }, + { 0, 0, 3, 21, 30, 10, 0, 0 }, + { 0, 0, 2, 20, 31, 11, 0, 0 }, + { 0, 0, 2, 18, 31, 13, 0, 0 }, + { 0, 0, 1, 17, 31, 15, 0, 0 } +#if ARCH_X86_64 + /* Bilin scaled being very rarely used, add a new table entry + * and use the put/prep_8tap_scaled code, thus acting as a + * scaled bilinear filter. 
*/ + }, [5] = { + { 0, 0, 0, 60, 4, 0, 0, 0 }, + { 0, 0, 0, 56, 8, 0, 0, 0 }, + { 0, 0, 0, 52, 12, 0, 0, 0 }, + { 0, 0, 0, 48, 16, 0, 0, 0 }, + { 0, 0, 0, 44, 20, 0, 0, 0 }, + { 0, 0, 0, 40, 24, 0, 0, 0 }, + { 0, 0, 0, 36, 28, 0, 0, 0 }, + { 0, 0, 0, 32, 32, 0, 0, 0 }, + { 0, 0, 0, 28, 36, 0, 0, 0 }, + { 0, 0, 0, 24, 40, 0, 0, 0 }, + { 0, 0, 0, 20, 44, 0, 0, 0 }, + { 0, 0, 0, 16, 48, 0, 0, 0 }, + { 0, 0, 0, 12, 52, 0, 0, 0 }, + { 0, 0, 0, 8, 56, 0, 0, 0 }, + { 0, 0, 0, 4, 60, 0, 0, 0 } +#endif + } +}; + +#if ARCH_X86 +#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v2, v4, v6, v1, v3, v5, v7 } +#else +#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v1, v2, v3, v4, v5, v6, v7 } +#endif +const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = { + // [-1, 0) + W( 0, 0, 127, 1, 0, 0, 0, 0 ), W( 0, - 1, 127, 2, 0, 0, 0, 0 ), + W( 1, - 3, 127, 4, - 1, 0, 0, 0 ), W( 1, - 4, 126, 6, - 2, 1, 0, 0 ), + W( 1, - 5, 126, 8, - 3, 1, 0, 0 ), W( 1, - 6, 125, 11, - 4, 1, 0, 0 ), + W( 1, - 7, 124, 13, - 4, 1, 0, 0 ), W( 2, - 8, 123, 15, - 5, 1, 0, 0 ), + W( 2, - 9, 122, 18, - 6, 1, 0, 0 ), W( 2, -10, 121, 20, - 6, 1, 0, 0 ), + W( 2, -11, 120, 22, - 7, 2, 0, 0 ), W( 2, -12, 119, 25, - 8, 2, 0, 0 ), + W( 3, -13, 117, 27, - 8, 2, 0, 0 ), W( 3, -13, 116, 29, - 9, 2, 0, 0 ), + W( 3, -14, 114, 32, -10, 3, 0, 0 ), W( 3, -15, 113, 35, -10, 2, 0, 0 ), + W( 3, -15, 111, 37, -11, 3, 0, 0 ), W( 3, -16, 109, 40, -11, 3, 0, 0 ), + W( 3, -16, 108, 42, -12, 3, 0, 0 ), W( 4, -17, 106, 45, -13, 3, 0, 0 ), + W( 4, -17, 104, 47, -13, 3, 0, 0 ), W( 4, -17, 102, 50, -14, 3, 0, 0 ), + W( 4, -17, 100, 52, -14, 3, 0, 0 ), W( 4, -18, 98, 55, -15, 4, 0, 0 ), + W( 4, -18, 96, 58, -15, 3, 0, 0 ), W( 4, -18, 94, 60, -16, 4, 0, 0 ), + W( 4, -18, 91, 63, -16, 4, 0, 0 ), W( 4, -18, 89, 65, -16, 4, 0, 0 ), + W( 4, -18, 87, 68, -17, 4, 0, 0 ), W( 4, -18, 85, 70, -17, 4, 0, 0 ), + W( 4, -18, 82, 73, -17, 4, 0, 0 ), W( 4, -18, 80, 75, -17, 4, 0, 0 ), + W( 4, -18, 78, 78, -18, 4, 0, 0 ), W( 4, -17, 75, 80, -18, 4, 0, 0 ), 
+ W( 4, -17, 73, 82, -18, 4, 0, 0 ), W( 4, -17, 70, 85, -18, 4, 0, 0 ), + W( 4, -17, 68, 87, -18, 4, 0, 0 ), W( 4, -16, 65, 89, -18, 4, 0, 0 ), + W( 4, -16, 63, 91, -18, 4, 0, 0 ), W( 4, -16, 60, 94, -18, 4, 0, 0 ), + W( 3, -15, 58, 96, -18, 4, 0, 0 ), W( 4, -15, 55, 98, -18, 4, 0, 0 ), + W( 3, -14, 52, 100, -17, 4, 0, 0 ), W( 3, -14, 50, 102, -17, 4, 0, 0 ), + W( 3, -13, 47, 104, -17, 4, 0, 0 ), W( 3, -13, 45, 106, -17, 4, 0, 0 ), + W( 3, -12, 42, 108, -16, 3, 0, 0 ), W( 3, -11, 40, 109, -16, 3, 0, 0 ), + W( 3, -11, 37, 111, -15, 3, 0, 0 ), W( 2, -10, 35, 113, -15, 3, 0, 0 ), + W( 3, -10, 32, 114, -14, 3, 0, 0 ), W( 2, - 9, 29, 116, -13, 3, 0, 0 ), + W( 2, - 8, 27, 117, -13, 3, 0, 0 ), W( 2, - 8, 25, 119, -12, 2, 0, 0 ), + W( 2, - 7, 22, 120, -11, 2, 0, 0 ), W( 1, - 6, 20, 121, -10, 2, 0, 0 ), + W( 1, - 6, 18, 122, - 9, 2, 0, 0 ), W( 1, - 5, 15, 123, - 8, 2, 0, 0 ), + W( 1, - 4, 13, 124, - 7, 1, 0, 0 ), W( 1, - 4, 11, 125, - 6, 1, 0, 0 ), + W( 1, - 3, 8, 126, - 5, 1, 0, 0 ), W( 1, - 2, 6, 126, - 4, 1, 0, 0 ), + W( 0, - 1, 4, 127, - 3, 1, 0, 0 ), W( 0, 0, 2, 127, - 1, 0, 0, 0 ), + // [0, 1) + W( 0, 0, 0, 127, 1, 0, 0, 0),W( 0, 0, -1, 127, 2, 0, 0, 0), + W( 0, 1, -3, 127, 4, -2, 1, 0),W( 0, 1, -5, 127, 6, -2, 1, 0), + W( 0, 2, -6, 126, 8, -3, 1, 0),W(-1, 2, -7, 126, 11, -4, 2, -1), + W(-1, 3, -8, 125, 13, -5, 2, -1),W(-1, 3, -10, 124, 16, -6, 3, -1), + W(-1, 4, -11, 123, 18, -7, 3, -1),W(-1, 4, -12, 122, 20, -7, 3, -1), + W(-1, 4, -13, 121, 23, -8, 3, -1),W(-2, 5, -14, 120, 25, -9, 4, -1), + W(-1, 5, -15, 119, 27, -10, 4, -1),W(-1, 5, -16, 118, 30, -11, 4, -1), + W(-2, 6, -17, 116, 33, -12, 5, -1),W(-2, 6, -17, 114, 35, -12, 5, -1), + W(-2, 6, -18, 113, 38, -13, 5, -1),W(-2, 7, -19, 111, 41, -14, 6, -2), + W(-2, 7, -19, 110, 43, -15, 6, -2),W(-2, 7, -20, 108, 46, -15, 6, -2), + W(-2, 7, -20, 106, 49, -16, 6, -2),W(-2, 7, -21, 104, 51, -16, 7, -2), + W(-2, 7, -21, 102, 54, -17, 7, -2),W(-2, 8, -21, 100, 56, -18, 7, -2), + W(-2, 8, -22, 98, 59, -18, 7, -2),W(-2, 8, 
-22, 96, 62, -19, 7, -2), + W(-2, 8, -22, 94, 64, -19, 7, -2),W(-2, 8, -22, 91, 67, -20, 8, -2), + W(-2, 8, -22, 89, 69, -20, 8, -2),W(-2, 8, -22, 87, 72, -21, 8, -2), + W(-2, 8, -21, 84, 74, -21, 8, -2),W(-2, 8, -22, 82, 77, -21, 8, -2), + W(-2, 8, -21, 79, 79, -21, 8, -2),W(-2, 8, -21, 77, 82, -22, 8, -2), + W(-2, 8, -21, 74, 84, -21, 8, -2),W(-2, 8, -21, 72, 87, -22, 8, -2), + W(-2, 8, -20, 69, 89, -22, 8, -2),W(-2, 8, -20, 67, 91, -22, 8, -2), + W(-2, 7, -19, 64, 94, -22, 8, -2),W(-2, 7, -19, 62, 96, -22, 8, -2), + W(-2, 7, -18, 59, 98, -22, 8, -2),W(-2, 7, -18, 56, 100, -21, 8, -2), + W(-2, 7, -17, 54, 102, -21, 7, -2),W(-2, 7, -16, 51, 104, -21, 7, -2), + W(-2, 6, -16, 49, 106, -20, 7, -2),W(-2, 6, -15, 46, 108, -20, 7, -2), + W(-2, 6, -15, 43, 110, -19, 7, -2),W(-2, 6, -14, 41, 111, -19, 7, -2), + W(-1, 5, -13, 38, 113, -18, 6, -2),W(-1, 5, -12, 35, 114, -17, 6, -2), + W(-1, 5, -12, 33, 116, -17, 6, -2),W(-1, 4, -11, 30, 118, -16, 5, -1), + W(-1, 4, -10, 27, 119, -15, 5, -1),W(-1, 4, -9, 25, 120, -14, 5, -2), + W(-1, 3, -8, 23, 121, -13, 4, -1),W(-1, 3, -7, 20, 122, -12, 4, -1), + W(-1, 3, -7, 18, 123, -11, 4, -1),W(-1, 3, -6, 16, 124, -10, 3, -1), + W(-1, 2, -5, 13, 125, -8, 3, -1),W(-1, 2, -4, 11, 126, -7, 2, -1), + W( 0, 1, -3, 8, 126, -6, 2, 0),W( 0, 1, -2, 6, 127, -5, 1, 0), + W( 0, 1, -2, 4, 127, -3, 1, 0),W( 0, 0, 0, 2, 127, -1, 0, 0), + // [1, 2) + W( 0, 0, 0, 1, 127, 0, 0, 0 ),W( 0, 0, 0, - 1, 127, 2, 0, 0 ), + W( 0, 0, 1, - 3, 127, 4, - 1, 0 ), W( 0, 0, 1, - 4, 126, 6, - 2, 1 ), + W( 0, 0, 1, - 5, 126, 8, - 3, 1 ), W( 0, 0, 1, - 6, 125, 11, - 4, 1 ), + W( 0, 0, 1, - 7, 124, 13, - 4, 1 ), W( 0, 0, 2, - 8, 123, 15, - 5, 1 ), + W( 0, 0, 2, - 9, 122, 18, - 6, 1 ), W( 0, 0, 2, -10, 121, 20, - 6, 1 ), + W( 0, 0, 2, -11, 120, 22, - 7, 2 ), W( 0, 0, 2, -12, 119, 25, - 8, 2 ), + W( 0, 0, 3, -13, 117, 27, - 8, 2 ), W( 0, 0, 3, -13, 116, 29, - 9, 2 ), + W( 0, 0, 3, -14, 114, 32, -10, 3 ), W( 0, 0, 3, -15, 113, 35, -10, 2 ), + W( 0, 0, 3, -15, 111, 37, -11, 3 
), W( 0, 0, 3, -16, 109, 40, -11, 3 ), + W( 0, 0, 3, -16, 108, 42, -12, 3 ), W( 0, 0, 4, -17, 106, 45, -13, 3 ), + W( 0, 0, 4, -17, 104, 47, -13, 3 ), W( 0, 0, 4, -17, 102, 50, -14, 3 ), + W( 0, 0, 4, -17, 100, 52, -14, 3 ), W( 0, 0, 4, -18, 98, 55, -15, 4 ), + W( 0, 0, 4, -18, 96, 58, -15, 3 ), W( 0, 0, 4, -18, 94, 60, -16, 4 ), + W( 0, 0, 4, -18, 91, 63, -16, 4 ), W( 0, 0, 4, -18, 89, 65, -16, 4 ), + W( 0, 0, 4, -18, 87, 68, -17, 4 ), W( 0, 0, 4, -18, 85, 70, -17, 4 ), + W( 0, 0, 4, -18, 82, 73, -17, 4 ), W( 0, 0, 4, -18, 80, 75, -17, 4 ), + W( 0, 0, 4, -18, 78, 78, -18, 4 ), W( 0, 0, 4, -17, 75, 80, -18, 4 ), + W( 0, 0, 4, -17, 73, 82, -18, 4 ), W( 0, 0, 4, -17, 70, 85, -18, 4 ), + W( 0, 0, 4, -17, 68, 87, -18, 4 ), W( 0, 0, 4, -16, 65, 89, -18, 4 ), + W( 0, 0, 4, -16, 63, 91, -18, 4 ), W( 0, 0, 4, -16, 60, 94, -18, 4 ), + W( 0, 0, 3, -15, 58, 96, -18, 4 ), W( 0, 0, 4, -15, 55, 98, -18, 4 ), + W( 0, 0, 3, -14, 52, 100, -17, 4 ), W( 0, 0, 3, -14, 50, 102, -17, 4 ), + W( 0, 0, 3, -13, 47, 104, -17, 4 ), W( 0, 0, 3, -13, 45, 106, -17, 4 ), + W( 0, 0, 3, -12, 42, 108, -16, 3 ), W( 0, 0, 3, -11, 40, 109, -16, 3 ), + W( 0, 0, 3, -11, 37, 111, -15, 3 ), W( 0, 0, 2, -10, 35, 113, -15, 3 ), + W( 0, 0, 3, -10, 32, 114, -14, 3 ), W( 0, 0, 2, - 9, 29, 116, -13, 3 ), + W( 0, 0, 2, - 8, 27, 117, -13, 3 ), W( 0, 0, 2, - 8, 25, 119, -12, 2 ), + W( 0, 0, 2, - 7, 22, 120, -11, 2 ), W( 0, 0, 1, - 6, 20, 121, -10, 2 ), + W( 0, 0, 1, - 6, 18, 122, - 9, 2 ), W( 0, 0, 1, - 5, 15, 123, - 8, 2 ), + W( 0, 0, 1, - 4, 13, 124, - 7, 1 ), W( 0, 0, 1, - 4, 11, 125, - 6, 1 ), + W( 0, 0, 1, - 3, 8, 126, - 5, 1 ), W( 0, 0, 1, - 2, 6, 126, - 4, 1 ), + W( 0, 0, 0, - 1, 4, 127, - 3, 1 ), W( 0, 0, 0, 0, 2, 127, - 1, 0 ), + // dummy (replicate row index 191) + W( 0, 0, 0, 0, 2, 127, - 1, 0 ), +}; + +const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = { + { 0, 0, 0, -128, 0, 0, 0, 0 }, { 0, 0, 1, -128, -2, 1, 0, 0 }, + { 0, -1, 3, -127, -4, 2, -1, 0 }, { 0, -1, 4, -127, -6, 3, -1, 0 }, + { 0, -2, 6, 
-126, -8, 3, -1, 0 }, { 0, -2, 7, -125, -11, 4, -1, 0 }, + { 1, -2, 8, -125, -13, 5, -2, 0 }, { 1, -3, 9, -124, -15, 6, -2, 0 }, + { 1, -3, 10, -123, -18, 6, -2, 1 }, { 1, -3, 11, -122, -20, 7, -3, 1 }, + { 1, -4, 12, -121, -22, 8, -3, 1 }, { 1, -4, 13, -120, -25, 9, -3, 1 }, + { 1, -4, 14, -118, -28, 9, -3, 1 }, { 1, -4, 15, -117, -30, 10, -4, 1 }, + { 1, -5, 16, -116, -32, 11, -4, 1 }, { 1, -5, 16, -114, -35, 12, -4, 1 }, + { 1, -5, 17, -112, -38, 12, -4, 1 }, { 1, -5, 18, -111, -40, 13, -5, 1 }, + { 1, -5, 18, -109, -43, 14, -5, 1 }, { 1, -6, 19, -107, -45, 14, -5, 1 }, + { 1, -6, 19, -105, -48, 15, -5, 1 }, { 1, -6, 19, -103, -51, 16, -5, 1 }, + { 1, -6, 20, -101, -53, 16, -6, 1 }, { 1, -6, 20, -99, -56, 17, -6, 1 }, + { 1, -6, 20, -97, -58, 17, -6, 1 }, { 1, -6, 20, -95, -61, 18, -6, 1 }, + { 2, -7, 20, -93, -64, 18, -6, 2 }, { 2, -7, 20, -91, -66, 19, -6, 1 }, + { 2, -7, 20, -88, -69, 19, -6, 1 }, { 2, -7, 20, -86, -71, 19, -6, 1 }, + { 2, -7, 20, -84, -74, 20, -7, 2 }, { 2, -7, 20, -81, -76, 20, -7, 1 }, + { 2, -7, 20, -79, -79, 20, -7, 2 }, { 1, -7, 20, -76, -81, 20, -7, 2 }, + { 2, -7, 20, -74, -84, 20, -7, 2 }, { 1, -6, 19, -71, -86, 20, -7, 2 }, + { 1, -6, 19, -69, -88, 20, -7, 2 }, { 1, -6, 19, -66, -91, 20, -7, 2 }, + { 2, -6, 18, -64, -93, 20, -7, 2 }, { 1, -6, 18, -61, -95, 20, -6, 1 }, + { 1, -6, 17, -58, -97, 20, -6, 1 }, { 1, -6, 17, -56, -99, 20, -6, 1 }, + { 1, -6, 16, -53, -101, 20, -6, 1 }, { 1, -5, 16, -51, -103, 19, -6, 1 }, + { 1, -5, 15, -48, -105, 19, -6, 1 }, { 1, -5, 14, -45, -107, 19, -6, 1 }, + { 1, -5, 14, -43, -109, 18, -5, 1 }, { 1, -5, 13, -40, -111, 18, -5, 1 }, + { 1, -4, 12, -38, -112, 17, -5, 1 }, { 1, -4, 12, -35, -114, 16, -5, 1 }, + { 1, -4, 11, -32, -116, 16, -5, 1 }, { 1, -4, 10, -30, -117, 15, -4, 1 }, + { 1, -3, 9, -28, -118, 14, -4, 1 }, { 1, -3, 9, -25, -120, 13, -4, 1 }, + { 1, -3, 8, -22, -121, 12, -4, 1 }, { 1, -3, 7, -20, -122, 11, -3, 1 }, + { 1, -2, 6, -18, -123, 10, -3, 1 }, { 0, -2, 6, -15, -124, 9, -3, 1 }, + 
{ 0, -2, 5, -13, -125, 8, -2, 1 }, { 0, -1, 4, -11, -125, 7, -2, 0 }, + { 0, -1, 3, -8, -126, 6, -2, 0 }, { 0, -1, 3, -6, -127, 4, -1, 0 }, + { 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 }, +}; + +const uint8_t ALIGN(dav1d_sm_weights[128], 16) = { + // Unused, because we always offset by bs, which is at least 2. + 0, 0, + // bs = 2 + 255, 128, + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, + 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, + 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, + 21, 17, 14, 12, 10, 9, 8, 8, + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, + 196, 189, 182, 176, 169, 163, 156, 150, + 144, 138, 133, 127, 121, 116, 111, 106, + 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, + 38, 35, 32, 29, 27, 25, 22, 20, + 18, 16, 15, 13, 12, 10, 9, 8, + 7, 6, 6, 5, 5, 4, 4, 4 +}; + +const uint16_t dav1d_dr_intra_derivative[44] = { + // Values that are 0 will never be used + 0, // Angles: + 1023, 0, // 3, 93, 183 + 547, // 6, 96, 186 + 372, 0, 0, // 9, 99, 189 + 273, // 14, 104, 194 + 215, 0, // 17, 107, 197 + 178, // 20, 110, 200 + 151, 0, // 23, 113, 203 (113 & 203 are base angles) + 132, // 26, 116, 206 + 116, 0, // 29, 119, 209 + 102, 0, // 32, 122, 212 + 90, // 36, 126, 216 + 80, 0, // 39, 129, 219 + 71, // 42, 132, 222 + 64, 0, // 45, 135, 225 (45 & 135 are base angles) + 57, // 48, 138, 228 + 51, 0, // 51, 141, 231 + 45, 0, // 54, 144, 234 + 40, // 58, 148, 238 + 35, 0, // 61, 151, 241 + 31, // 64, 154, 244 + 27, 0, // 67, 157, 247 (67 & 157 are base angles) + 23, // 70, 160, 250 + 19, 0, // 73, 163, 253 + 15, 0, // 76, 166, 256 + 11, 0, // 81, 171, 261 + 7, // 84, 174, 264 + 3 // 87, 177, 267 +}; + +#if ARCH_X86 +#define F(idx, f0, f1, f2, f3, f4, f5, f6) \ + [2*idx+0] = f0, [2*idx+1] = f1, \ + [2*idx+16] = f2, [2*idx+17] = f3, \ + [2*idx+32] = f4, 
[2*idx+33] = f5, \ + [2*idx+48] = f6 +#else +#define F(idx, f0, f1, f2, f3, f4, f5, f6) \ + [1*idx+0] = f0, [1*idx+8] = f1, \ + [1*idx+16] = f2, [1*idx+24] = f3, \ + [1*idx+32] = f4, [1*idx+40] = f5, \ + [1*idx+48] = f6 +#endif +const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = { + { + F( 0, -6, 10, 0, 0, 0, 12, 0 ), + F( 1, -5, 2, 10, 0, 0, 9, 0 ), + F( 2, -3, 1, 1, 10, 0, 7, 0 ), + F( 3, -3, 1, 1, 2, 10, 5, 0 ), + F( 4, -4, 6, 0, 0, 0, 2, 12 ), + F( 5, -3, 2, 6, 0, 0, 2, 9 ), + F( 6, -3, 2, 2, 6, 0, 2, 7 ), + F( 7, -3, 1, 2, 2, 6, 3, 5 ), + }, { + F( 0, -10, 16, 0, 0, 0, 10, 0 ), + F( 1, -6, 0, 16, 0, 0, 6, 0 ), + F( 2, -4, 0, 0, 16, 0, 4, 0 ), + F( 3, -2, 0, 0, 0, 16, 2, 0 ), + F( 4, -10, 16, 0, 0, 0, 0, 10 ), + F( 5, -6, 0, 16, 0, 0, 0, 6 ), + F( 6, -4, 0, 0, 16, 0, 0, 4 ), + F( 7, -2, 0, 0, 0, 16, 0, 2 ), + }, { + F( 0, -8, 8, 0, 0, 0, 16, 0 ), + F( 1, -8, 0, 8, 0, 0, 16, 0 ), + F( 2, -8, 0, 0, 8, 0, 16, 0 ), + F( 3, -8, 0, 0, 0, 8, 16, 0 ), + F( 4, -4, 4, 0, 0, 0, 0, 16 ), + F( 5, -4, 0, 4, 0, 0, 0, 16 ), + F( 6, -4, 0, 0, 4, 0, 0, 16 ), + F( 7, -4, 0, 0, 0, 4, 0, 16 ), + }, { + F( 0, -2, 8, 0, 0, 0, 10, 0 ), + F( 1, -1, 3, 8, 0, 0, 6, 0 ), + F( 2, -1, 2, 3, 8, 0, 4, 0 ), + F( 3, 0, 1, 2, 3, 8, 2, 0 ), + F( 4, -1, 4, 0, 0, 0, 3, 10 ), + F( 5, -1, 3, 4, 0, 0, 4, 6 ), + F( 6, -1, 2, 3, 4, 0, 4, 4 ), + F( 7, -1, 2, 2, 3, 4, 3, 3 ), + }, { + F( 0, -12, 14, 0, 0, 0, 14, 0 ), + F( 1, -10, 0, 14, 0, 0, 12, 0 ), + F( 2, -9, 0, 0, 14, 0, 11, 0 ), + F( 3, -8, 0, 0, 0, 14, 10, 0 ), + F( 4, -10, 12, 0, 0, 0, 0, 14 ), + F( 5, -9, 1, 12, 0, 0, 0, 12 ), + F( 6, -8, 0, 0, 12, 0, 1, 11 ), + F( 7, -7, 0, 0, 1, 12, 1, 9 ), + } +}; + +const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = { + /* Unused */ + 0, 0, + /* 2 */ + 19, 0, + /* 4 */ + 25, 14, 5, 0, + /* 8 */ + 28, 22, 16, 11, 7, 3, 0, 0, + /* 16 */ + 30, 27, 24, 21, 18, 15, 12, 10, 8, 6, 4, 3, 0, 0, 0, 0, + /* 32 */ + 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9, + 8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 
0, 0, 0, +}; + +// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512 +const int16_t dav1d_gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, 
-592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 
424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, 
-980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 
24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, 
-304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; diff --git a/third_party/dav1d/src/tables.h b/third_party/dav1d/src/tables.h new file mode 100644 index 0000000000..abcf26592f --- /dev/null +++ b/third_party/dav1d/src/tables.h @@ -0,0 +1,125 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_TABLES_H +#define DAV1D_SRC_TABLES_H + +#include <stdint.h> + +#include "common/intops.h" + +#include "src/levels.h" + +extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS]; +extern const uint8_t /* enum BlockSize */ + dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2]; +// width, height (in 4px blocks), log2 versions of these two +extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4]; +typedef struct TxfmInfo { + // width, height (in 4px blocks), log2 of them, min/max of log2, sub, ctx + uint8_t w, h, lw, lh, min, max, sub, ctx; +} TxfmInfo; +extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES]; +extern const uint8_t /* enum (Rect)TxfmSize */ + dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */]; +extern const uint8_t /* enum TxfmType */ + dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES]; + +extern const uint8_t /* enum InterPredMode */ + dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2]; + +extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS]; +extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40]; + +extern const uint8_t dav1d_filter_mode_to_y_mode[5]; +extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES]; +extern const uint8_t dav1d_lo_ctx_offsets[3][5][5]; +extern const uint8_t dav1d_skip_ctx[5][5]; +extern const uint8_t /* enum TxClass */ + dav1d_tx_type_class[N_TX_TYPES_PLUS_LL]; +extern const uint8_t /* enum Filter2d */ + dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */]; +extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2]; +extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES]; +extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES]; + +static const unsigned cfl_allowed_mask = /* bitmask: bit (1 << BS_*) is set for each block size listed */ + (1 << BS_32x32) | + (1 << BS_32x16) | + (1 << BS_32x8) | + (1 << BS_16x32) | + (1 << BS_16x16) | + (1 << BS_16x8) | + (1 << BS_16x4) | + (1 << BS_8x32) | + (1 << BS_8x16) | + (1 << BS_8x8) | + (1 << BS_8x4) | + (1 << BS_4x16) | + (1 << BS_4x8)
| + (1 << BS_4x4); + +static const unsigned wedge_allowed_mask = /* bitmask: bit (1 << BS_*) is set for each block size listed */ + (1 << BS_32x32) | + (1 << BS_32x16) | + (1 << BS_32x8) | + (1 << BS_16x32) | + (1 << BS_16x16) | + (1 << BS_16x8) | + (1 << BS_8x32) | + (1 << BS_8x16) | + (1 << BS_8x8); + +static const unsigned interintra_allowed_mask = /* bitmask: bit (1 << BS_*) is set for each block size listed */ + (1 << BS_32x32) | + (1 << BS_32x16) | + (1 << BS_16x32) | + (1 << BS_16x16) | + (1 << BS_16x8) | + (1 << BS_8x16) | + (1 << BS_8x8); + +extern const Dav1dWarpedMotionParams dav1d_default_wm_params; + +extern const int8_t dav1d_cdef_directions[12][2]; + +extern const int16_t dav1d_sgr_params[16][4]; +extern const uint8_t dav1d_sgr_x_by_x[256]; + +extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8]; +extern const int8_t dav1d_mc_warp_filter[193][8]; +extern const int8_t dav1d_resize_filter[64][8]; + +extern const uint8_t dav1d_sm_weights[128]; +extern const uint16_t dav1d_dr_intra_derivative[44]; +extern const int8_t dav1d_filter_intra_taps[5][64]; + +extern const uint8_t dav1d_obmc_masks[64]; + +extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs + +#endif /* DAV1D_SRC_TABLES_H */ diff --git a/third_party/dav1d/src/thread.h b/third_party/dav1d/src/thread.h new file mode 100644 index 0000000000..6cd304e2bd --- /dev/null +++ b/third_party/dav1d/src/thread.h @@ -0,0 +1,180 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_THREAD_H +#define DAV1D_SRC_THREAD_H + +#if defined(_WIN32) /* Windows: pthread-style shim built on SRWLOCK, CONDITION_VARIABLE and INIT_ONCE */ + +#include <limits.h> +#include <windows.h> + +#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT + +typedef struct { + HANDLE h; + void *(*func)(void*); + void *arg; +} pthread_t; + +typedef struct { + unsigned stack_size; +} pthread_attr_t; + +typedef SRWLOCK pthread_mutex_t; +typedef CONDITION_VARIABLE pthread_cond_t; +typedef INIT_ONCE pthread_once_t; + +void dav1d_init_thread(void); +void dav1d_set_thread_name(const wchar_t *name); +#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name) /* pastes L onto the argument, so a string literal is passed as a wide string */ + +int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*func)(void*), void *arg); +int dav1d_pthread_join(pthread_t *thread, void **res); +int dav1d_pthread_once(pthread_once_t *once_control, + void (*init_routine)(void)); + +#define pthread_create dav1d_pthread_create +#define pthread_join(thread, res) dav1d_pthread_join(&(thread), res) /* the shim takes the thread object by address */ +#define pthread_once dav1d_pthread_once + +static inline int pthread_attr_init(pthread_attr_t *const attr) { + attr->stack_size = 0; + return 0; +} + +static inline int pthread_attr_destroy(pthread_attr_t *const attr) { + return 0; /* nothing to release: the attribute only holds a stack size */ +} + +static inline int
pthread_attr_setstacksize(pthread_attr_t *const attr, + const size_t stack_size) +{ + if (stack_size > UINT_MAX) return 1; /* would not fit the unsigned stack_size field */ + attr->stack_size = (unsigned) stack_size; + return 0; +} + +static inline int pthread_mutex_init(pthread_mutex_t *const mutex, + const void *const attr) +{ + InitializeSRWLock(mutex); /* attr is ignored */ + return 0; +} + +static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + return 0; +} + +static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) { + AcquireSRWLockExclusive(mutex); + return 0; +} + +static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + ReleaseSRWLockExclusive(mutex); + return 0; +} + +static inline int pthread_cond_init(pthread_cond_t *const cond, + const void *const attr) +{ + InitializeConditionVariable(cond); /* attr is ignored */ + return 0; +} + +static inline int pthread_cond_destroy(pthread_cond_t *const cond) { + return 0; +} + +static inline int pthread_cond_wait(pthread_cond_t *const cond, + pthread_mutex_t *const mutex) +{ + return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0); /* non-zero result from the Win32 call maps to 0, zero maps to 1 */ +} + +static inline int pthread_cond_signal(pthread_cond_t *const cond) { + WakeConditionVariable(cond); + return 0; +} + +static inline int pthread_cond_broadcast(pthread_cond_t *const cond) { + WakeAllConditionVariable(cond); + return 0; +} + +#else /* !_WIN32: use the system pthread implementation directly */ + +#include <pthread.h> + +#define dav1d_init_thread() do {} while (0) + +/* Thread naming support */ + +#ifdef __linux__ + +#include <sys/prctl.h> + +static inline void dav1d_set_thread_name(const char *const name) { + prctl(PR_SET_NAME, name); +} + +#elif defined(__APPLE__) + +static inline void dav1d_set_thread_name(const char *const name) { + pthread_setname_np(name); +} + +#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__) + +#if defined(__FreeBSD__) + /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */ +#define _SYS_PARAM_H_ +#include <sys/types.h> +#endif +#include <pthread_np.h> + +static inline void dav1d_set_thread_name(const char *const name) { + pthread_set_name_np(pthread_self(),
name); +} + +#elif defined(__NetBSD__) + +static inline void dav1d_set_thread_name(const char *const name) { + pthread_setname_np(pthread_self(), "%s", (void*)name); +} + +#else + +#define dav1d_set_thread_name(name) do {} while (0) /* remaining platforms: naming a thread is a no-op */ + +#endif + +#endif + +#endif /* DAV1D_SRC_THREAD_H */ diff --git a/third_party/dav1d/src/thread_data.h b/third_party/dav1d/src/thread_data.h new file mode 100644 index 0000000000..62814e6348 --- /dev/null +++ b/third_party/dav1d/src/thread_data.h @@ -0,0 +1,40 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef DAV1D_SRC_THREAD_DATA_H +#define DAV1D_SRC_THREAD_DATA_H + +#include "src/thread.h" + +struct thread_data { /* a thread handle bundled with the condition variable and mutex stored next to it */ + pthread_t thread; + pthread_cond_t cond; + pthread_mutex_t lock; + int inited; /* NOTE(review): meaning not visible in this header; presumably records that cond/lock were initialized - confirm against users */ +}; + +#endif /* DAV1D_SRC_THREAD_DATA_H */ diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c new file mode 100644 index 0000000000..6c1c13907e --- /dev/null +++ b/third_party/dav1d/src/thread_task.c @@ -0,0 +1,142 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "config.h" + +#include "src/thread_task.h" + /* Frame worker thread entry point; `data` is the Dav1dFrameContext. Sleeps on frame_thread.td.cond until n_tile_data is non-zero or frame_thread.die is set, runs dav1d_decode_frame() with td.lock released, then clears n_tile_data and signals td.cond. Leaves the loop when die is set; always returns NULL. */ +void *dav1d_frame_task(void *const data) { + Dav1dFrameContext *const f = data; + + dav1d_set_thread_name("dav1d-frame"); + pthread_mutex_lock(&f->frame_thread.td.lock); + for (;;) { + while (!f->n_tile_data && !f->frame_thread.die) { + pthread_cond_wait(&f->frame_thread.td.cond, + &f->frame_thread.td.lock); + } + if (f->frame_thread.die) break; + pthread_mutex_unlock(&f->frame_thread.td.lock); + + if (dav1d_decode_frame(f)) /* non-zero result: zero the frame_thread.cf buffer */ + memset(f->frame_thread.cf, 0, + (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); + + pthread_mutex_lock(&f->frame_thread.td.lock); + f->n_tile_data = 0; + pthread_cond_signal(&f->frame_thread.td.cond); + } + pthread_mutex_unlock(&f->frame_thread.td.lock); + + return NULL; +} + /* Tile worker thread entry point; `data` is the Dav1dTileContext. Each iteration sets this worker's bit in fttd->available, waits on fttd->cond until tasks_left is non-zero or tile_thread.die is set, claims one task index, and then decodes either every superblock row of one tile or a single (sby, tile) pair. Returns NULL once die is set. */ +void *dav1d_tile_task(void *const data) { + Dav1dTileContext *const t = data; + struct FrameTileThreadData *const fttd = t->tile_thread.fttd; + const Dav1dFrameContext *const f = t->f; + const int tile_thread_idx = (int) (t - f->tc); + const uint64_t mask = 1ULL << tile_thread_idx; + + dav1d_set_thread_name("dav1d-tile"); + + for (;;) { + pthread_mutex_lock(&fttd->lock); + fttd->available |= mask; + int did_signal = 0; /* icond is signalled at most once per idle wait */ + while (!fttd->tasks_left && !t->tile_thread.die) { + if (!did_signal) { + did_signal = 1; + pthread_cond_signal(&fttd->icond); + } + pthread_cond_wait(&fttd->cond, &fttd->lock); + } + if (t->tile_thread.die) { + pthread_cond_signal(&fttd->icond); + pthread_mutex_unlock(&fttd->lock); + break; + } + fttd->available &= ~mask; + const int task_idx = fttd->num_tasks - fttd->tasks_left--; /* claim the next task: the index grows as tasks_left counts down */ + pthread_mutex_unlock(&fttd->lock); + + if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) { + // we can (or in fact, if >, we need to) do full tile decoding.
+ // loopfilter happens in the main thread + Dav1dTileState *const ts = t->ts = &f->ts[task_idx]; + for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end; + t->by += f->sb_step) + { + const int error = dav1d_decode_tile_sbrow(t); + const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift); + + // signal progress + pthread_mutex_lock(&ts->tile_thread.lock); + atomic_store(&ts->progress, progress); + pthread_cond_signal(&ts->tile_thread.cond); + pthread_mutex_unlock(&ts->tile_thread.lock); + if (error) break; + } + } else { + const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0]; + const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1]; + Dav1dTileState *const ts = &f->ts[tile_idx]; + int progress; + + // the interleaved decoding can sometimes cause dependency issues + // if one part of the frame decodes significantly faster than others. + // Ideally, we'd "skip" tile_sbrows where dependencies are missing, + // and resume them later as dependencies are met. This also would + // solve the broadcast() below and allow us to use signal(). However, + // for now, we use linear dependency tracking because it's simpler. + if ((progress = atomic_load(&ts->progress)) < sby) { + pthread_mutex_lock(&ts->tile_thread.lock); + while ((progress = atomic_load(&ts->progress)) < sby) + pthread_cond_wait(&ts->tile_thread.cond, + &ts->tile_thread.lock); + pthread_mutex_unlock(&ts->tile_thread.lock); + } + if (progress == TILE_ERROR) continue; + + // we need to interleave sbrow decoding for all tile cols in a + // tile row, since otherwise subsequent threads will be blocked + // waiting for the post-filter to complete + t->ts = ts; + t->by = sby << f->sb_shift; + const int error = dav1d_decode_tile_sbrow(t); + progress = error ?
TILE_ERROR : 1 + sby; + + // signal progress + pthread_mutex_lock(&ts->tile_thread.lock); + atomic_store(&ts->progress, progress); + pthread_cond_broadcast(&ts->tile_thread.cond); + pthread_mutex_unlock(&ts->tile_thread.lock); + } + } + + return NULL; +} diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h new file mode 100644 index 0000000000..309a714255 --- /dev/null +++ b/third_party/dav1d/src/thread_task.h @@ -0,0 +1,44 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef DAV1D_SRC_THREAD_TASK_H +#define DAV1D_SRC_THREAD_TASK_H + +#include + +#include "src/internal.h" + +#define FRAME_ERROR (UINT_MAX - 1) +#define TILE_ERROR (INT_MAX - 1) + +int dav1d_decode_frame(Dav1dFrameContext *f); +void *dav1d_frame_task(void *data); + +int dav1d_decode_tile_sbrow(Dav1dTileContext *t); +void *dav1d_tile_task(void *data); + +#endif /* DAV1D_SRC_THREAD_TASK_H */ diff --git a/third_party/dav1d/src/warpmv.c b/third_party/dav1d/src/warpmv.c new file mode 100644 index 0000000000..439c4304c7 --- /dev/null +++ b/third_party/dav1d/src/warpmv.c @@ -0,0 +1,209 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include + +#include "common/intops.h" + +#include "src/warpmv.h" + +static const uint16_t div_lut[257] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 
9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +static inline int iclip_wmp(const int v) { + const int cv = iclip(v, INT16_MIN, INT16_MAX); + + return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6); +} + +static inline int resolve_divisor_32(const unsigned d, int *const shift) { + *shift = ulog2(d); + const int e = d - (1 << *shift); + const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) : + e << (8 - *shift); + assert(f <= 256); + *shift += 14; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) { + const int32_t *const mat = wm->matrix; + + if (mat[2] <= 0) return 1; + + wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000); + wm->u.p.beta = iclip_wmp(mat[3]); + + int shift; + const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]); + const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y; + const int rnd = (1 << shift) >> 1; + wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); + const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y; + wm->u.p.delta = iclip_wmp(mat[5] - + apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) - + 0x10000); + + return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) || + (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000); +} + +static int resolve_divisor_64(const uint64_t d, int *const shift) { + *shift = u64log2(d); + const int64_t e = d - (1LL << *shift); + const int64_t f = *shift > 8 ? 
(e + (1LL << (*shift - 9))) >> (*shift - 8) : + e << (8 - *shift); + assert(f <= 256); + *shift += 14; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int get_mult_shift_ndiag(const int64_t px, + const int idet, const int shift) +{ + const int64_t v1 = px * idet; + const int v2 = apply_sign64((int) ((llabs(v1) + + ((1LL << shift) >> 1)) >> shift), + v1); + return iclip(v2, -0x1fff, 0x1fff); +} + +static int get_mult_shift_diag(const int64_t px, + const int idet, const int shift) +{ + const int64_t v1 = px * idet; + const int v2 = apply_sign64((int) ((llabs(v1) + + ((1LL << shift) >> 1)) >> shift), + v1); + return iclip(v2, 0xe001, 0x11fff); +} + +void dav1d_set_affine_mv2d(const int bw4, const int bh4, + const mv mv, Dav1dWarpedMotionParams *const wm, + const int bx4, const int by4) +{ + int32_t *const mat = wm->matrix; + const int rsuy = 2 * bh4 - 1; + const int rsux = 2 * bw4 - 1; + const int isuy = by4 * 4 + rsuy; + const int isux = bx4 * 4 + rsux; + + mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]), + -0x800000, 0x7fffff); + mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)), + -0x800000, 0x7fffff); +} + +int dav1d_find_affine_int(const int (*pts)[2][2], const int np, + const int bw4, const int bh4, + const mv mv, Dav1dWarpedMotionParams *const wm, + const int bx4, const int by4) +{ + int32_t *const mat = wm->matrix; + int a[2][2] = { { 0, 0 }, { 0, 0 } }; + int bx[2] = { 0, 0 }; + int by[2] = { 0, 0 }; + const int rsuy = 2 * bh4 - 1; + const int rsux = 2 * bw4 - 1; + const int suy = rsuy * 8; + const int sux = rsux * 8; + const int duy = suy + mv.y; + const int dux = sux + mv.x; + const int isuy = by4 * 4 + rsuy; + const int isux = bx4 * 4 + rsux; + + for (int i = 0; i < np; i++) { + const int dx = pts[i][1][0] - dux; + const int dy = pts[i][1][1] - duy; + const int sx = pts[i][0][0] - sux; + const int sy = pts[i][0][1] - suy; + if (abs(sx - dx) < 256 && abs(sy 
- dy) < 256) { + a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8; + a[0][1] += ((sx * sy) >> 2) + sx + sy + 4; + a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8; + bx[0] += ((sx * dx) >> 2) + sx + dx + 8; + bx[1] += ((sy * dx) >> 2) + sy + dx + 4; + by[0] += ((sx * dy) >> 2) + sx + dy + 4; + by[1] += ((sy * dy) >> 2) + sy + dy + 8; + } + } + + // compute determinant of a + const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1]; + if (det == 0) return 1; + int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det); + shift -= 16; + if (shift < 0) { + idet <<= -shift; + shift = 0; + } + + // solve the least-squares + mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] - + (int64_t) a[0][1] * bx[1], idet, shift); + mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] - + (int64_t) a[0][1] * bx[0], idet, shift); + mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] - + (int64_t) a[0][1] * by[1], idet, shift); + mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] - + (int64_t) a[0][1] * by[0], idet, shift); + + mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]), + -0x800000, 0x7fffff); + mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)), + -0x800000, 0x7fffff); + + return 0; +} diff --git a/third_party/dav1d/src/warpmv.h b/third_party/dav1d/src/warpmv.h new file mode 100644 index 0000000000..08e841d1ca --- /dev/null +++ b/third_party/dav1d/src/warpmv.h @@ -0,0 +1,39 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_WARPMV_H +#define DAV1D_SRC_WARPMV_H + +#include "src/levels.h" + +int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm); +int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4, + mv mv, Dav1dWarpedMotionParams *wm, int bx, int by); +void dav1d_set_affine_mv2d(int bw4, int bh4, + mv mv, Dav1dWarpedMotionParams *wm, int bx, int by); + +#endif /* DAV1D_SRC_WARPMV_H */ diff --git a/third_party/dav1d/src/wedge.c b/third_party/dav1d/src/wedge.c new file mode 100644 index 0000000000..6b14e9a442 --- /dev/null +++ b/third_party/dav1d/src/wedge.c @@ -0,0 +1,342 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include + +#include "common/intops.h" + +#include "src/wedge.h" + +enum WedgeDirectionType { + WEDGE_HORIZONTAL = 0, + WEDGE_VERTICAL = 1, + WEDGE_OBLIQUE27 = 2, + WEDGE_OBLIQUE63 = 3, + WEDGE_OBLIQUE117 = 4, + WEDGE_OBLIQUE153 = 5, + N_WEDGE_DIRECTIONS +}; + +typedef struct { + enum WedgeDirectionType direction; + int x_offset; + int y_offset; +} wedge_code_type; + +static const wedge_code_type wedge_codebook_16_hgtw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_hltw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_heqw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static uint8_t 
ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64); +static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64); +static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64); +static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64); +static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64); + +static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64); +static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64); +static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32); + +static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64); +static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64); +static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64); +static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64); +static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32); +static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16); + +const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16]; + +static void insert_border(uint8_t *const dst, const uint8_t *const src, + const int ctr) +{ + if (ctr > 4) memset(dst, 0, ctr - 4); + memcpy(dst + 
imax(ctr, 4) - 4, src + imax(4 - ctr, 0), imin(64 - ctr, 8)); + if (ctr < 64 - 4) + memset(dst + ctr + 4, 64, 64 - 4 - ctr); +} + +static void transpose(uint8_t *const dst, const uint8_t *const src) { + for (int y = 0, y_off = 0; y < 64; y++, y_off += 64) + for (int x = 0, x_off = 0; x < 64; x++, x_off += 64) + dst[x_off + y] = src[y_off + x]; +} + +static void hflip(uint8_t *const dst, const uint8_t *const src) { + for (int y = 0, y_off = 0; y < 64; y++, y_off += 64) + for (int x = 0; x < 64; x++) + dst[y_off + 64 - 1 - x] = src[y_off + x]; +} + +static void invert(uint8_t *const dst, const uint8_t *const src, + const int w, const int h) +{ + for (int y = 0, y_off = 0; y < h; y++, y_off += w) + for (int x = 0; x < w; x++) + dst[y_off + x] = 64 - src[y_off + x]; +} + +static void copy2d(uint8_t *dst, const uint8_t *src, + const int w, const int h, const int x_off, const int y_off) +{ + src += y_off * 64 + x_off; + for (int y = 0; y < h; y++) { + memcpy(dst, src, w); + src += 64; + dst += w; + } +} + +static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma, + const int sign, const int w, const int h, + const int ss_ver) +{ + for (int y = 0; y < h; y += 1 + ss_ver) { + for (int x = 0; x < w; x += 2) { + int sum = luma[x] + luma[x + 1] + 1; + if (ss_ver) sum += luma[w + x] + luma[w + x + 1] + 1; + chroma[x >> 1] = (sum - sign) >> (1 + ss_ver); + } + luma += w << ss_ver; + chroma += w >> 1; + } +} + +static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h, + const enum BlockSize bs, + const uint8_t (*const master)[64 * 64], + const wedge_code_type *const cb, + uint8_t *masks_444, uint8_t *masks_422, + uint8_t *masks_420, const unsigned signs) +{ + uint8_t *ptr = dst; + for (int n = 0; n < 16; n++) { + copy2d(ptr, master[cb[n].direction], w, h, + 32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3)); + ptr += w * h; + } + for (int n = 0, off = 0; n < 16; n++, off += w * h) + invert(ptr + off, dst + off, w, h); + + const int n_stride_444 = 
(w * h); + const int n_stride_422 = n_stride_444 >> 1; + const int n_stride_420 = n_stride_444 >> 2; + const int sign_stride_444 = 16 * n_stride_444; + const int sign_stride_422 = 16 * n_stride_422; + const int sign_stride_420 = 16 * n_stride_420; + // assign pointers in externally visible array + for (int n = 0; n < 16; n++) { + const int sign = (signs >> n) & 1; + dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444]; + // not using !sign is intentional here, since 444 does not require + // any rounding since no chroma subsampling is applied. + dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444]; + dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422]; + dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422]; + dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420]; + dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420]; + masks_444 += n_stride_444; + masks_422 += n_stride_422; + masks_420 += n_stride_420; + + // since the pointers come from inside, we know that + // violation of the const is OK here. 
Any other approach + // means we would have to duplicate the sign correction + // logic in two places, which isn't very nice, or mark + // the table faced externally as non-const, which also sucks + init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n], + dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0); + init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n], + dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0); + init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n], + dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1); + init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n], + dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1); + } +} + +COLD void dav1d_init_wedge_masks(void) { + // This function is guaranteed to be called only once + + enum WedgeMasterLineType { + WEDGE_MASTER_LINE_ODD, + WEDGE_MASTER_LINE_EVEN, + WEDGE_MASTER_LINE_VERT, + N_WEDGE_MASTER_LINES, + }; + static const uint8_t wedge_master_border[N_WEDGE_MASTER_LINES][8] = { + [WEDGE_MASTER_LINE_ODD] = { 1, 2, 6, 18, 37, 53, 60, 63 }, + [WEDGE_MASTER_LINE_EVEN] = { 1, 4, 11, 27, 46, 58, 62, 63 }, + [WEDGE_MASTER_LINE_VERT] = { 0, 2, 7, 21, 43, 57, 62, 64 }, + }; + uint8_t master[6][64 * 64]; + + // create master templates + for (int y = 0, off = 0; y < 64; y++, off += 64) + insert_border(&master[WEDGE_VERTICAL][off], + wedge_master_border[WEDGE_MASTER_LINE_VERT], 32); + for (int y = 0, off = 0, ctr = 48; y < 64; y += 2, off += 128, ctr--) + { + insert_border(&master[WEDGE_OBLIQUE63][off], + wedge_master_border[WEDGE_MASTER_LINE_EVEN], ctr); + insert_border(&master[WEDGE_OBLIQUE63][off + 64], + wedge_master_border[WEDGE_MASTER_LINE_ODD], ctr - 1); + } + + transpose(master[WEDGE_OBLIQUE27], master[WEDGE_OBLIQUE63]); + transpose(master[WEDGE_HORIZONTAL], master[WEDGE_VERTICAL]); + hflip(master[WEDGE_OBLIQUE117], master[WEDGE_OBLIQUE63]); + hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]); + +#define fill(w, h, sz_422, sz_420, hvsw, signs) \ + fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \ + 
master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \ + wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs) + + fill(32, 32, 16x32, 16x16, heqw, 0x7bfb); + fill(32, 16, 16x16, 16x8, hltw, 0x7beb); + fill(32, 8, 16x8, 16x4, hltw, 0x6beb); + fill(16, 32, 8x32, 8x16, hgtw, 0x7beb); + fill(16, 16, 8x16, 8x8, heqw, 0x7bfb); + fill(16, 8, 8x8, 8x4, hltw, 0x7beb); + fill( 8, 32, 4x32, 4x16, hgtw, 0x7aeb); + fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb); + fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb); +#undef fill +} + +#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1) +static uint8_t ALIGN(ii_dc_mask[32 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64); +static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64); +static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64); +static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64); +static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32); +static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16); +#undef N_II_PRED_MODES + +#define set1(sz) \ + [II_DC_PRED] = ii_dc_mask, \ + [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \ + [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \ + [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1] +#define set(sz_444, sz_422, sz_420) \ + { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } } +const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = { + [BS_8x8] = set( 8x8, 4x8, 4x4), + [BS_8x16] = set( 8x16, 4x16, 4x8), + [BS_16x8] = set(16x16, 8x8, 8x8), + [BS_16x16] = set(16x16, 8x16, 8x8), + [BS_16x32] = set(16x32, 8x32, 8x16), + [BS_32x16] = set(32x32, 16x16, 16x16), + [BS_32x32] = set(32x32, 16x32, 16x16), +}; +#undef set +#undef 
set1 + +static COLD void build_nondc_ii_masks(uint8_t *const mask_v, + uint8_t *const mask_h, + uint8_t *const mask_sm, + const int w, const int h, const int step) +{ + static const uint8_t ii_weights_1d[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + }; + + for (int y = 0, off = 0; y < h; y++, off += w) { + memset(&mask_v[off], ii_weights_1d[y * step], w); + for (int x = 0; x < w; x++) { + mask_sm[off + x] = ii_weights_1d[imin(x, y) * step]; + mask_h[off + x] = ii_weights_1d[x * step]; + } + } +} + +COLD void dav1d_init_interintra_masks(void) { + // This function is guaranteed to be called only once + + memset(ii_dc_mask, 32, 32 * 32); +#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1] + build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1); + build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1); + build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2); + build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1); + build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2); + build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4); + build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2); + build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4); + build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8); +#undef set +} diff --git a/third_party/dav1d/src/wedge.h b/third_party/dav1d/src/wedge.h new file mode 100644 index 0000000000..45f0570a27 --- /dev/null +++ b/third_party/dav1d/src/wedge.h @@ -0,0 +1,41 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_WEDGE_H +#define DAV1D_SRC_WEDGE_H + +#include "src/levels.h" + +void dav1d_init_wedge_masks(void); +extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] + [2 /* sign */][16 /* wedge_idx */]; + +void dav1d_init_interintra_masks(void); +extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] + [N_INTER_INTRA_PRED_MODES]; + +#endif /* DAV1D_SRC_WEDGE_H */ diff --git a/third_party/dav1d/src/win32/thread.c b/third_party/dav1d/src/win32/thread.c new file mode 100644 index 0000000000..5e878bfe0e --- /dev/null +++ b/third_party/dav1d/src/win32/thread.c @@ -0,0 +1,96 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#if defined(_WIN32) + +#include +#include +#include + +#include "common/attributes.h" + +#include "src/thread.h" + +static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR); + +COLD void dav1d_init_thread(void) { + set_thread_description = + (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), + "SetThreadDescription"); +} + +#undef dav1d_set_thread_name +COLD void dav1d_set_thread_name(const wchar_t *const name) { + if (set_thread_description) /* Only available since Windows 10 1607 */ + set_thread_description(GetCurrentThread(), name); +} + +static COLD unsigned __stdcall thread_entrypoint(void *const data) { + pthread_t *const t = data; + t->arg = t->func(t->arg); + return 0; +} + +COLD int dav1d_pthread_create(pthread_t *const thread, + const pthread_attr_t *const attr, + void *(*const func)(void*), void *const arg) +{ + const unsigned stack_size = attr ? attr->stack_size : 0; + thread->func = func; + thread->arg = arg; + thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread, + STACK_SIZE_PARAM_IS_A_RESERVATION, NULL); + return !thread->h; +} + +COLD int dav1d_pthread_join(pthread_t *const thread, void **const res) { + if (WaitForSingleObject(thread->h, INFINITE)) + return 1; + + if (res) + *res = thread->arg; + + return !CloseHandle(thread->h); +} + +COLD int dav1d_pthread_once(pthread_once_t *const once_control, + void (*const init_routine)(void)) +{ + BOOL pending = FALSE; + + if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE) + return 1; + + if (pending == TRUE) + init_routine(); + + return !InitOnceComplete(once_control, 0, NULL); +} + +#endif diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm new file mode 100644 index 0000000000..672fae7a51 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_avx2.asm @@ -0,0 +1,1799 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +%macro JMP_TABLE 2-* + %xdefine %1_jmptable %%table + %xdefine %%base mangle(private_prefix %+ _%1_avx2) + %%table: + %rep %0 - 1 + dd %%base %+ .%2 - %%table + %rotate 1 + %endrep +%endmacro + +%macro CDEF_FILTER_JMP_TABLE 1 +JMP_TABLE cdef_filter_%1, \ + d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ + d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1 +%endmacro + +SECTION_RODATA 32 + +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 +blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 + dd 0x80, 0x00, 0x00 +blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + dd 0x00, 0x00 +blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000 +blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000, 0x0000 +blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 +blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 +shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_128: times 2 dw 128 +pw_2048: times 2 dw 2048 +tap_table: ; masks for 8 bit shifts + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights + db 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +CDEF_FILTER_JMP_TABLE 4x4 
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+; PREP_REGS w, h
+; Register setup for the non-border (all edges available) filter path.
+; Loads the direction from stack argument r6m, points tableq at the
+; cdef_filter_<w>x<h> jump table and dirq at its entry dir*2 (4 bytes per
+; entry), then renames the registers for the loop and computes the row
+; pointers dst4 (dst + 4 rows), dst8 (4x8 only) and stride3 (w == 4 only).
+; For w == 8, h is initialised to -8 and top2 to top1 + stride.
+%macro PREP_REGS 2 ; w, h
+    ; dirq = &jmptable[dir*2]; ACCUMULATE_TAP_BYTE adds tap_offset*2 + k
+    mov dird, r6m
+    lea tableq, [cdef_filter_%1x%2_jmptable]
+    lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+    DEFINE_ARGS dst, stride, left, top, pri, sec, \
+                table, dir, dirjmp, dst4, stride3, k
+ %else
+    DEFINE_ARGS dst, stride, left, top, pri, sec, \
+                table, dir, dirjmp, dst4, dst8, stride3, k
+    lea dst8q, [dstq+strideq*8]
+ %endif
+%else
+    DEFINE_ARGS dst, stride, h, top1, pri, sec, \
+                table, dir, dirjmp, top2, dst4, stride3, k
+    mov hq, -8
+    lea top1q, [top1q+strideq*0]
+    lea top2q, [top1q+strideq*1]
+%endif
+    lea dst4q, [dstq+strideq*4]
+%if %1 == 4
+    lea stride3q, [strideq*3]
+%endif
+%endmacro
+
+; LOAD_BLOCK w, h [, init_min_max = 0]
+; Loads the current pixels (8-bit) from dst into m4, sets the tap counter
+; k to 1 and clears the accumulator m15 (plus m12 when h == 8).
+;   4x4: rows 0-3, 4x8: rows 0-7 (via dst4), 8x8: rows 0-3 of the current half.
+; With init_min_max == 1, m7 and m8 are seeded with the pixels as the
+; starting values for the running max / min used when clipping.
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+    mov kd, 1
+    pxor m15, m15 ; sum
+%if %2 == 8
+    pxor m12, m12
+ %if %1 == 4
+    movd xm4, [dstq +strideq*0]
+    movd xm6, [dstq +strideq*1]
+    movd xm5, [dstq +strideq*2]
+    movd xm7, [dstq +stride3q ]
+    vinserti128 m4, [dst4q+strideq*0], 1
+    vinserti128 m6, [dst4q+strideq*1], 1
+    vinserti128 m5, [dst4q+strideq*2], 1
+    vinserti128 m7, [dst4q+stride3q ], 1
+    punpckldq m4, m6
+    punpckldq m5, m7
+ %else
+    movq xm4, [dstq+strideq*0]
+    movq xm5, [dstq+strideq*1]
+    vinserti128 m4, [dstq+strideq*2], 1
+    vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+    punpcklqdq m4, m5
+%else
+    movd xm4, [dstq+strideq*0]
+    movd xm5, [dstq+strideq*1]
+    vinserti128 m4, [dstq+strideq*2], 1
+    vinserti128 m5, [dstq+stride3q ], 1
+    punpckldq m4, m5
+%endif
+%if %3 == 1
+    ; m7 is updated with pmaxub and m8 with pminub in ACCUMULATE_TAP_BYTE
+    mova m7, m4 ; max
+    mova m8, m4 ; min
+%endif
+%endmacro
+
+; ACCUMULATE_TAP_BYTE tap_offset, shift, mask, strength, mul_tap, w, h [, clip = 0]
+; Adds one tap pair to the running sum, working on 8-bit pixels.
+; The two neighbour vectors are produced in m5/m6 by calling the .dNkM
+; loader selected through the jump table entry at dirq + (k + tap_offset*2)*4.
+; For each neighbour p and centre pixel px (m4) it accumulates
+;   mul_tap * sign(p - px) * min(|p - px|, max(0, strength - (|p - px| >> shift)))
+; into m15 (h == 4), or into m15 (low half) and m12 (high half) otherwise.
+; With clip == 1 the running max (m7) and min (m8) are updated first.
+; Clobbers m5, m6, m9 and, when h != 4, m10 and m11.
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, h, clip
+    ; load p0/p1
+    movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+    add dirjmpq, tableq
+    call dirjmpq
+
+%if %8 == 1
+    pmaxub m7, m5
+    pminub m8, m5
+    pmaxub m7, m6
+    pminub m8, m6
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+    ; interleave p0/p1 bytes and duplicate px so both taps share one pass
+    punpcklbw m5, m6
+    punpcklbw m6, m4, m4
+    psubusb m9, m5, m6
+    psubusb m5, m6, m5
+    por m9, m5 ; abs_diff_p01(p01 - px)
+    ; m5 = 0xFF where px - p saturates to the abs diff (p <= px), else tap;
+    ; psignb then yields the tap weight carrying the sign of (p - px)
+    pcmpeqb m5, m9
+    por m5, %5
+    psignb m6, %5, m5
+    psrlw m5, m9, %2 ; emulate 8-bit shift
+    ; mask off the bits shifted in from the neighbouring byte
+    pand m5, %3
+    psubusb m5, %4, m5
+    pminub m5, m9
+    pmaddubsw m5, m6
+    paddw m15, m5
+%else
+    psubusb m9, m5, m4
+    psubusb m5, m4, m5
+    psubusb m11, m6, m4
+    psubusb m6, m4, m6
+    por m9, m5 ; abs_diff_p0(p0 - px)
+    por m11, m6 ; abs_diff_p1(p1 - px)
+    pcmpeqb m5, m9
+    pcmpeqb m6, m11
+    punpckhbw m10, m9, m11
+    punpcklbw m9, m11
+    por m5, %5
+    por m11, m6, %5
+    punpckhbw m6, m5, m11
+    punpcklbw m5, m11
+    ; high halves -> m12
+    psignb m11, %5, m6
+    psrlw m6, m10, %2 ; emulate 8-bit shift
+    pand m6, %3
+    psubusb m6, %4, m6
+    pminub m6, m10
+    pmaddubsw m6, m11
+    paddw m12, m6
+    ; low halves -> m15
+    psignb m11, %5, m5
+    psrlw m5, m9, %2 ; emulate 8-bit shift
+    pand m5, %3
+    psubusb m5, %4, m5
+    pminub m5, m9
+    pmaddubsw m5, m11
+    paddw m15, m5
+%endif
+%endmacro
+
+; ADJUST_PIXEL w, h, zero, pw_2048, clip
+; Applies the accumulated sum to the pixels in m4 and stores them to dst.
+; The sum is first reduced to (sum - (sum < 0) + 8) >> 4: the pcmpgtw/paddw
+; pair subtracts 1 from negative sums and pmulhrsw by 2048 performs the
+; rounded shift by 4.
+;   clip == 0: the adjustment is packed to bytes and added to the pixels.
+;   clip == 1: pixels are widened to words, adjusted, packed with unsigned
+;              saturation and clamped to [m8, m7] (running min / max).
+; Note: in the h == 4 path the zero register (%3) is overwritten by pcmpgtw.
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+    punpcklbw m4, %3
+ %endif
+    pcmpgtw %3, m15
+    paddw m15, %3
+    pmulhrsw m15, %4
+ %if %5 == 0
+    packsswb m15, m15
+    paddb m4, m15
+ %else
+    paddw m4, m15
+    packuswb m4, m4 ; clip px in [0x0,0xff]
+    pminub m4, m7
+    pmaxub m4, m8
+ %endif
+    vextracti128 xm5, m4, 1
+    movd [dstq+strideq*0], xm4
+    movd [dstq+strideq*2], xm5
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+stride3q ], xm5, 1
+%else
+    pcmpgtw m6, %3, m12
+    pcmpgtw m5, %3, m15
+    paddw m12, m6
+    paddw m15, m5
+ %if %5 == 1
+    punpckhbw m5, m4, %3
+    punpcklbw m4, %3
+ %endif
+    pmulhrsw m12, %4
+    pmulhrsw m15, %4
+ %if %5 == 0
+    packsswb m15, m12
+    paddb m4, m15
+ %else
+    paddw m5, m12
+    paddw m4, m15
+    packuswb m4, m5 ; clip px in [0x0,0xff]
+    pminub m4, m7
+    pmaxub m4, m8
+ %endif
+    vextracti128 xm5, m4, 1
+ %if %1 == 4
+    movd [dstq +strideq*0], xm4
+    movd [dst4q+strideq*0], xm5
+    pextrd [dstq +strideq*1], xm4, 1
+    pextrd [dst4q+strideq*1], xm5, 1
+    pextrd [dstq +strideq*2], xm4, 2
+    pextrd [dst4q+strideq*2], xm5, 2
+    pextrd [dstq +stride3q ], xm4, 3
+    pextrd [dst4q+stride3q ], xm5, 3
+ %else
+    movq [dstq+strideq*0], xm4
+    movq [dstq+strideq*2], xm5
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+; BORDER_PREP_REGS w, h
+; Register setup for the border path, which filters from the 16-bit pixel
+; buffer at px (32 bytes per row). Expects tableq to point at tap_table.
+; Points dirq at the direction offsets for dir, sets stkq to px, zeroes m11
+; and, when the block needs more than one ymm of 16-bit pixels, loads the
+; pass count w*h*2/mmsize into h.
+%macro BORDER_PREP_REGS 2 ; w, h
+    ; dirq = &tap_table[14 + dir*2]; ACCUMULATE_TAP_WORD reads the signed
+    ; byte offset at [dirq + k + tap_offset]
+    mov dird, r6m
+    lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
+ %else
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %endif
+    mov hd, %1*%2*2/mmsize
+%else
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
+%endif
+    lea stkq, [px]
+    pxor m11, m11
+%endmacro
+
+; BORDER_LOAD_BLOCK w, h [, init_min_max = 0]
+; Loads one ymm of 16-bit pixels from the buffer at stkq into m4 (4 rows of
+; 4 pixels for w == 4, otherwise 2 rows of 8), sets k to 1 and clears the
+; accumulator m15. With init_min_max == 1, m7 / m8 are seeded with the
+; pixels as the running max / min.
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+    mov kd, 1
+%if %1 == 4
+    movq xm4, [stkq+32*0]
+    movhps xm4, [stkq+32*1]
+    movq xm5, [stkq+32*2]
+    movhps xm5, [stkq+32*3]
+    vinserti128 m4, xm5, 1
+%else
+    mova xm4, [stkq+32*0] ; px
+    vinserti128 m4, [stkq+32*1], 1
+%endif
+    pxor m15, m15 ; sum
+%if %3 == 1
+    mova m7, m4 ; max
+    mova m8, m4 ; min
+%endif
+%endmacro
+
+; ACCUMULATE_TAP_WORD tap_offset, shift, mask, strength, mul_tap, w [, clip = 0]
+; Border-path counterpart of ACCUMULATE_TAP_BYTE. Reads the signed pixel
+; offset for this tap from [dirq + k + tap_offset], loads the neighbours at
+; +offset (p0 -> m5) and -offset (p1 -> m6) from the 16-bit buffer at stkq,
+; and adds the constrained, weighted differences to m15.
+; Expects m12 to hold the shufb_lohi shuffle. Clobbers m5, m6, m9, m10.
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, clip
+    ; load p0/p1
+    movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+    movq xm5, [stkq+offq*2+32*0] ; p0
+    movq xm6, [stkq+offq*2+32*2]
+    movhps xm5, [stkq+offq*2+32*1]
+    movhps xm6, [stkq+offq*2+32*3]
+    vinserti128 m5, xm6, 1
+%else
+    movu xm5, [stkq+offq*2+32*0] ; p0
+    vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+    neg offq ; -off1
+%if %6 == 4
+    movq xm6, [stkq+offq*2+32*0] ; p1
+    movq xm9, [stkq+offq*2+32*2]
+    movhps xm6, [stkq+offq*2+32*1]
+    movhps xm9, [stkq+offq*2+32*3]
+    vinserti128 m6, xm9, 1
+%else
+    movu xm6, [stkq+offq*2+32*0] ; p1
+    vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+    ; out of bounds values are set to a value that is both a large unsigned
+    ; value and a negative signed value.
+    ; use signed max and unsigned min to remove them
+    pmaxsw m7, m5 ; max after p0
+    pminuw m8, m5 ; min after p0
+    pmaxsw m7, m6 ; max after p1
+    pminuw m8, m6 ; min after p1
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+    ; calculate difference before converting
+    psubw m5, m4 ; diff_p0(p0 - px)
+    psubw m6, m4 ; diff_p1(p1 - px)
+
+    ; convert to 8-bits with signed saturation
+    ; saturating to large diffs has no impact on the results
+    packsswb m5, m6
+
+    ; group into pairs so we can accumulate using maddubsw
+    pshufb m5, m12
+    pabsb m9, m5
+    psignb m10, %5, m5
+    psrlw m5, m9, %2 ; emulate 8-bit shift
+    pand m5, %3
+    psubusb m5, %4, m5
+
+    ; use unsigned min since abs diff can equal 0x80
+    pminub m5, m9
+    pmaddubsw m5, m10
+    paddw m15, m5
+%endmacro
+
+; BORDER_ADJUST_PIXEL w, pw_2048 [, clip = 0]
+; Adds (sum - (sum < 0) + 8) >> 4 to the 16-bit pixels in m4 (m11 must be
+; zero), optionally clamps to [m8, m7], packs to bytes with unsigned
+; saturation and stores 4 rows (w == 4) or 2 rows (w == 8) to dst.
+; Clobbers m5 and m9.
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+    pcmpgtw m9, m11, m15
+    paddw m15, m9
+    pmulhrsw m15, %2
+    paddw m4, m15
+%if %3 == 1
+    pminsw m4, m7
+    pmaxsw m4, m8
+%endif
+    packuswb m4, m4
+    vextracti128 xm5, m4, 1
+%if %1 == 4
+    movd [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd [dstq+strideq*2], xm5
+    pextrd [dstq+stride3q], xm5, 1
+%else
+    movq [dstq+strideq*0], xm4
+    movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
+                                     pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+    mov edged, edgem
+    cmp edged, 0xf
+    jne .border_block
+
+    PUSH r9
+    PUSH r10
+    PUSH r11
+%if %2 == 4
+ %assign regs_used 12
+ %if STACK_ALIGNMENT < 32
+    PUSH r%+regs_used
+  %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 0x60, 16
+    pmovzxbw xm0, [leftq+1]
+    vpermq m0, m0, q0110
+    psrldq m1, m0, 4
+    vpalignr m2, m0, m0, 12
+    movu [rsp+0x10], m0
+    movu [rsp+0x28], m1
+    movu [rsp+0x40], m2
+%elif %1 == 4
+    PUSH r12
+ %assign regs_used 13
+ %if STACK_ALIGNMENT < 32
+    PUSH r%+regs_used
+  %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 8*2+%1*%2*1, 16
+    pmovzxwd m0, [leftq]
+    mova [rsp+0x10], m0
+%else + PUSH r12 + PUSH r13 + %assign regs_used 14 + %if STACK_ALIGNMENT < 32 + PUSH r%+regs_used + %assign regs_used regs_used+1 + %endif + ALLOC_STACK 8*2+%1*%2*2+32, 16 + lea r11, [strideq*3] + movu xm4, [dstq+strideq*2] + pmovzxwq m0, [leftq+0] + pmovzxwq m1, [leftq+8] + vinserti128 m4, [dstq+r11], 1 + pmovzxbd m2, [leftq+1] + pmovzxbd m3, [leftq+9] + mova [rsp+0x10], m0 + mova [rsp+0x30], m1 + mova [rsp+0x50], m2 + mova [rsp+0x70], m3 + mova [rsp+0x90], m4 +%endif + + DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping + mov dampingd, r7m + xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + test prid, prid + jz .sec_only + movd xm0, prid + lzcnt pridmpd, prid + add pridmpd, dampingd + cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + test secdmpd, secdmpd + jz .pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + PREP_REGS %1, %2 +%if %1*%2 > mmsize +.v_loop: +%endif + LOAD_BLOCK %1, %2, 1 +.k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 + dec kq + jge .k_loop + + vpbroadcastd m10, [pw_2048] + pxor m9, m9 + ADJUST_PIXEL %1, %2, m9, m10, 1 +%if %1*%2 > mmsize + mov dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] 
+ lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .v_loop +%endif + RET + +.pri_only: + DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, pri, _, table, dir + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + PREP_REGS %1, %2 + vpbroadcastd m3, [pw_2048] + pxor m1, m1 +%if %1*%2 > mmsize +.pri_v_loop: +%endif + LOAD_BLOCK %1, %2 +.pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 + dec kq + jge .pri_k_loop + ADJUST_PIXEL %1, %2, m1, m3 +%if %1*%2 > mmsize + mov dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] + lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .pri_v_loop +%endif + RET + +.sec_only: + DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, left, top, _, secdmp, table + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, _, sec, table, dir + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + PREP_REGS %1, %2 + vpbroadcastd m2, [pw_2048] + pxor m0, m0 +%if %1*%2 > mmsize +.sec_v_loop: +%endif + LOAD_BLOCK %1, %2 +.sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + dec kq + jge .sec_k_loop + ADJUST_PIXEL %1, %2, m0, m2 +%if %1*%2 > mmsize + mov dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] + lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .sec_v_loop +%endif + RET + +.d0k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-1] + vpbroadcastq m10, 
[dstq+strideq*2-1] + movd xm5, [topq+strideq*1+1] + movd xm9, [dstq+strideq*0+1] + psrldq m11, m6, 2 + psrldq m12, m10, 2 + vinserti128 m6, [dstq+stride3q -1], 1 + vinserti128 m10, [dstq+strideq*4-1], 1 + vpblendd m5, m11, 0x10 + vpblendd m9, m12, 0x10 + movu m11, [blend_4x4+16] + punpckldq m6, m10 + punpckldq m5, m9 + vpblendvb m6, [rsp+gprsize+0x28], m11 + %else + movd xm5, [topq +strideq*1+1] + movq xm6, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm5, [dstq +strideq*0+1], 1 + movhps xm6, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm9, xm6, 2 + shufps xm5, xm9, q2010 ; -1 +0 +1 +2 + shufps xm6, xm10, q2020 ; +1 +2 +3 +4 + psrldq xm9, xm11, 2 + psrldq xm10, 2 + shufps xm10, xm9, q2020 ; +3 +4 +5 +6 + movd xm9, [dst4q+stride3q -1] + pinsrd xm9, [dst4q+strideq*4-1], 1 + shufps xm11, xm9, q1020 ; +5 +6 +7 +8 + pmovzxbw m9, [leftq+3] + vinserti128 m6, xm11, 1 + movu m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [top2q +1] + vbroadcasti128 m10, [dstq+strideq*1-1] + vbroadcasti128 m11, [dstq+strideq*2-1] + movhps xm5, [dstq+strideq*0+1] + vinserti128 m6, m10, [dstq+stride3q -1], 1 + vinserti128 m9, m11, [dstq+strideq*4-1], 1 + psrldq m10, 2 + psrldq m11, 2 + punpcklqdq m6, m9 + movu m9, [r13+hq*2*1+16*1] + punpcklqdq m10, m11 + vpblendd m5, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9 +%endif + ret +.d1k0: +.d2k0: +.d3k0: +%if %1 == 4 + %if %2 == 4 + movq xm6, [dstq+strideq*0-1] + movq xm9, [dstq+strideq*1-1] + vinserti128 m6, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m11, [rsp+gprsize+0x10] + pcmpeqd m12, m12 + psrldq m5, m6, 2 + psrldq m10, m9, 2 + psrld m12, 24 + punpckldq m6, m9 + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movq xm6, [dstq +strideq*0-1] + movq xm9, [dstq +strideq*2-1] + movhps xm6, [dstq +strideq*1-1] + movhps xm9, 
[dstq +stride3q -1] + movq xm10, [dst4q+strideq*0-1] + movhps xm10, [dst4q+strideq*1-1] + psrldq xm5, xm6, 2 + psrldq xm11, xm9, 2 + shufps xm5, xm11, q2020 + movq xm11, [dst4q+strideq*2-1] + movhps xm11, [dst4q+stride3q -1] + shufps xm6, xm9, q2020 + shufps xm9, xm10, xm11, q2020 + vinserti128 m6, xm9, 1 + pmovzxbw m9, [leftq+1] + psrldq xm10, 2 + psrldq xm11, 2 + shufps xm10, xm11, q2020 + vpbroadcastd m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + movu xm5, [dstq+strideq*0-1] + movu xm9, [dstq+strideq*1-1] + vinserti128 m5, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m10, [blend_8x8_0+16] + punpcklqdq m6, m5, m9 + vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d4k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m10, [dstq+strideq*1-1] + vpbroadcastq m11, [dstq+strideq*2-1] + movd xm6, [topq+strideq*1-1] + movd xm9, [dstq+strideq*0-1] + psrldq m5, m10, 2 + psrldq m12, m11, 2 + vpblendd m6, m10, 0x10 + vpblendd m9, m11, 0x10 + movu m10, [blend_4x4] + vinserti128 m5, [dstq+stride3q +1], 1 + vinserti128 m12, [dstq+strideq*4+1], 1 + punpckldq m6, m9 + punpckldq m5, m12 + vpblendvb m6, [rsp+gprsize+0x40], m10 + %else + movd xm6, [topq +strideq*1-1] + movq xm9, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm6, [dstq +strideq*0-1], 1 + movhps xm9, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm5, xm9, 2 + shufps xm6, xm9, q2010 + psrldq xm9, xm10, 2 + shufps xm5, xm9, q2020 + shufps xm10, xm11, q2020 + movd xm9, [dst4q+stride3q +1] + vinserti128 m6, xm10, 1 + pinsrd xm9, [dst4q+strideq*4+1], 1 + psrldq xm11, 2 + pmovzxbw m10, [leftq-1] + shufps xm11, xm9, q1020 + movu m9, [blend_4x8_0] + vinserti128 m5, xm11, 1 + vpblendvb m6, m10, m9 + %endif +%else + lea r13, [blend_8x8_0+8] + movq xm6, [top2q -1] + vbroadcasti128 m5, [dstq+strideq*1-1] + 
vbroadcasti128 m9, [dstq+strideq*2-1] + movhps xm6, [dstq+strideq*0-1] + movu m11, [r13+hq*2*1+16*1] + punpcklqdq m10, m5, m9 + vinserti128 m5, [dstq+stride3q -1], 1 + vinserti128 m9, [dstq+strideq*4-1], 1 + vpblendd m6, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d5k0: +.d6k0: +.d7k0: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*1 ] + vpbroadcastd m5, [dstq+strideq*1 ] + vpbroadcastd m9, [dstq+strideq*2 ] + vpblendd xm6, [dstq+strideq*0-4], 0x2 + vpblendd m5, m9, 0x22 + vpblendd m6, m5, 0x30 + vinserti128 m5, [dstq+stride3q ], 1 + vpblendd m5, [dstq+strideq*4-20], 0x20 + %else + movd xm6, [topq +strideq*1] + movd xm5, [dstq +strideq*1] + movd xm9, [dstq +stride3q ] + movd xm10, [dst4q+strideq*1] + movd xm11, [dst4q+stride3q ] + pinsrd xm6, [dstq +strideq*0], 1 + pinsrd xm5, [dstq +strideq*2], 1 + pinsrd xm9, [dst4q+strideq*0], 1 + pinsrd xm10, [dst4q+strideq*2], 1 + pinsrd xm11, [dst4q+strideq*4], 1 + punpcklqdq xm6, xm5 + punpcklqdq xm5, xm9 + punpcklqdq xm9, xm10 + punpcklqdq xm10, xm11 + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + %endif +%else + movq xm6, [top2q ] + movq xm5, [dstq+strideq*1] + movq xm9, [dstq+stride3q ] + movhps xm6, [dstq+strideq*0] + movhps xm5, [dstq+strideq*2] + movhps xm9, [dstq+strideq*4] + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 +%endif + ret +.d0k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [dstq +strideq*2-2] + movd xm9, [dstq +stride3q -2] + movd xm5, [topq +strideq*0+2] + movd xm10, [topq +strideq*1+2] + pinsrw xm6, [leftq+4], 0 + pinsrw xm9, [leftq+6], 0 + vinserti128 m5, [dstq +strideq*0+2], 1 + vinserti128 m10, [dstq +strideq*1+2], 1 + vinserti128 m6, [dst4q+strideq*0-2], 1 + vinserti128 m9, [dst4q+strideq*1-2], 1 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movq xm6, [dstq +strideq*2-2] + movd xm10, [dst4q+strideq*2-2] + movd xm5, [topq +strideq*0+2] + movq xm9, [dst4q+strideq*0-2] + movhps xm6, [dstq +stride3q -2] + pinsrw 
xm10, [dst4q+stride3q ], 3 + pinsrd xm5, [topq +strideq*1+2], 1 + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst8q+strideq*0-2], 2 + pinsrd xm5, [dstq +strideq*0+2], 2 + pinsrd xm10, [dst8q+strideq*1-2], 3 + pinsrd xm5, [dstq +strideq*1+2], 3 + shufps xm11, xm6, xm9, q3131 + shufps xm6, xm9, q2020 + movu m9, [blend_4x8_3+8] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm11, 1 + vpblendvb m6, [rsp+gprsize+16+8], m9 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm6, [dstq +strideq*2-2] + movq xm9, [dstq +stride3q -2] + movq xm5, [top1q +2] + movq xm10, [top2q +2] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m6, [dst4q+strideq*0-2], 1 + vinserti128 m9, [dst4q+strideq*1-2], 1 + vinserti128 m5, [dstq +strideq*0+2], 1 + vinserti128 m10, [dstq +strideq*1+2], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11 +%endif + ret +.d1k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-2] + vpbroadcastq m9, [dstq+strideq*2-2] + movd xm5, [topq+strideq*1+2] + movd xm10, [dstq+strideq*0+2] + psrldq m11, m6, 4 + psrldq m12, m9, 4 + vpblendd m5, m11, 0x10 + movq xm11, [leftq+2] + vinserti128 m6, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + vpblendd m10, m12, 0x10 + pcmpeqd m12, m12 + pmovzxwd m11, xm11 + psrld m12, 16 + punpckldq m6, m9 + vpbroadcastd m9, [dstq+strideq*4-2] + vpblendvb m6, m11, m12 + punpckldq m5, m10 + vpblendd m6, m9, 0x20 + %else + movd xm5, [topq +strideq*1+2] + movq xm6, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q -2] + pinsrd xm5, [dstq +strideq*0+2], 1 + movhps xm6, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [dst4q+strideq*4-2], 1 + shufps xm5, xm6, q3110 + shufps xm6, xm9, q2020 + shufps xm9, xm10, q3131 + shufps xm10, xm11, q1020 + movu m11, [blend_4x8_2+4] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm9, 1 + vpblendvb m6, [rsp+gprsize+16+4], m11 + %endif +%else + 
lea r13, [blend_8x8_1+16] + movq xm5, [top2q +2] + vbroadcasti128 m6, [dstq+strideq*1-2] + vbroadcasti128 m9, [dstq+strideq*2-2] + movhps xm5, [dstq+strideq*0+2] + shufps m10, m6, m9, q2121 + vinserti128 m6, [dstq+stride3q -2], 1 + vinserti128 m9, [dstq+strideq*4-2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m5, m10, 0xF0 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11 +%endif + ret +.d2k1: +%if %1 == 4 + %if %2 == 4 + movq xm11, [leftq] + movq xm6, [dstq+strideq*0-2] + movq xm9, [dstq+strideq*1-2] + vinserti128 m6, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + psrldq m5, m6, 4 + psrldq m10, m9, 4 + pmovzxwd m11, xm11 + punpckldq m6, m9 + punpckldq m5, m10 + pblendw m6, m11, 0x05 + %else + movq xm5, [dstq +strideq*0-2] + movq xm9, [dstq +strideq*2-2] + movq xm10, [dst4q+strideq*0-2] + movq xm11, [dst4q+strideq*2-2] + movhps xm5, [dstq +strideq*1-2] + movhps xm9, [dstq +stride3q -2] + movhps xm10, [dst4q+strideq*1-2] + movhps xm11, [dst4q+stride3q -2] + shufps xm6, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + shufps xm9, xm10, xm11, q2020 + shufps xm10, xm11, q3131 + pmovzxwd m11, [leftq] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + pblendw m6, m11, 0x55 + %endif +%else + mova m11, [rsp+gprsize+16+hq*8+64] + movu xm5, [dstq+strideq*0-2] + movu xm9, [dstq+strideq*1-2] + vinserti128 m5, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + shufps m6, m5, m9, q1010 + shufps m5, m9, q2121 + pblendw m6, m11, 0x11 +%endif + ret +.d3k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m11, [dstq+strideq*1-2] + vpbroadcastq m12, [dstq+strideq*2-2] + movd xm6, [topq+strideq*1-2] + movd xm9, [dstq+strideq*0-2] + pblendw m11, [leftq-16+2], 0x01 + pblendw m12, [leftq-16+4], 0x01 + pinsrw xm9, [leftq- 0+0], 0 + psrldq m5, m11, 4 + psrldq m10, m12, 4 + vinserti128 m5, [dstq+stride3q +2], 1 + vinserti128 m10, [dstq+strideq*4+2], 1 + vpblendd m6, m11, 0x10 + vpblendd m9, m12, 0x10 + punpckldq m6, m9 + 
punpckldq m5, m10 + %else + movd xm6, [topq +strideq*1-2] + movq xm5, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q +2] + pinsrw xm6, [dstq +strideq*0 ], 3 + movhps xm5, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [dst4q+strideq*4+2], 1 + shufps xm6, xm5, q2010 + shufps xm5, xm9, q3131 + shufps xm9, xm10, q2020 + shufps xm10, xm11, q1031 + movu m11, [blend_4x8_2] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+16-4], m11 + %endif +%else + lea r13, [blend_8x8_1+8] + movq xm6, [top2q -2] + vbroadcasti128 m5, [dstq+strideq*1-2] + vbroadcasti128 m10, [dstq+strideq*2-2] + movhps xm6, [dstq+strideq*0-2] + punpcklqdq m9, m5, m10 + vinserti128 m5, [dstq+stride3q -2], 1 + vinserti128 m10, [dstq+strideq*4-2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m6, m9, 0xF0 + shufps m5, m10, q2121 + vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11 +%endif + ret +.d4k1: +%if %1 == 4 + %if %2 == 4 + vinserti128 m6, [dstq +strideq*0-2], 1 + vinserti128 m9, [dstq +strideq*1-2], 1 + movd xm5, [dstq +strideq*2+2] + movd xm10, [dstq +stride3q +2] + pblendw m6, [leftq-16+0], 0x01 + pblendw m9, [leftq-16+2], 0x01 + vinserti128 m5, [dst4q+strideq*0+2], 1 + vinserti128 m10, [dst4q+strideq*1+2], 1 + vpblendd m6, [topq +strideq*0-2], 0x01 + vpblendd m9, [topq +strideq*1-2], 0x01 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movd xm6, [topq +strideq*0-2] + movq xm5, [dstq +strideq*2-2] + movq xm9, [dst4q+strideq*0-2] + movd xm10, [dst4q+strideq*2+2] + pinsrd xm6, [topq +strideq*1-2], 1 + movhps xm5, [dstq +stride3q -2] + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst4q+stride3q +2], 1 + pinsrd xm6, [dstq +strideq*0-2], 2 + pinsrd xm10, [dst8q+strideq*0+2], 2 + pinsrd xm6, [dstq +strideq*1-2], 3 + pinsrd xm10, [dst8q+strideq*1+2], 3 + shufps xm11, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + movu m9, [blend_4x8_3] + vinserti128 m6, xm11, 
1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+16-8], m9 + %endif +%else + lea r13, [blend_8x8_1] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -2] + movq xm9, [top2q -2] + movq xm5, [dstq +strideq*2+2] + movq xm10, [dstq +stride3q +2] + vinserti128 m6, [dstq +strideq*0-2], 1 + vinserti128 m9, [dstq +strideq*1-2], 1 + vinserti128 m5, [dst4q+strideq*0+2], 1 + vinserti128 m10, [dst4q+strideq*1+2], 1 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11 + punpcklqdq m5, m10 +%endif + ret +.d5k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq +strideq*0-1] + movd xm9, [topq +strideq*1-1] + movd xm5, [dstq +strideq*2+1] + movd xm10, [dstq +stride3q +1] + pcmpeqd m12, m12 + pmovzxbw m11, [leftq-8+1] + psrld m12, 24 + vinserti128 m6, [dstq +strideq*0-1], 1 + vinserti128 m9, [dstq +strideq*1-1], 1 + vinserti128 m5, [dst4q+strideq*0+1], 1 + vinserti128 m10, [dst4q+strideq*1+1], 1 + punpckldq m6, m9 + pxor m9, m9 + vpblendd m12, m9, 0x0F + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movd xm6, [topq +strideq*0-1] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movd xm10, [dst4q+strideq*2+1] + pinsrd xm6, [topq +strideq*1-1], 1 + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + pinsrd xm10, [dst4q+stride3q +1], 1 + pinsrd xm6, [dstq +strideq*0-1], 2 + pinsrd xm10, [dst8q+strideq*0+1], 2 + pinsrd xm6, [dstq +strideq*1-1], 3 + pinsrd xm10, [dst8q+strideq*1+1], 3 + shufps xm11, xm5, xm9, q2020 + vinserti128 m6, xm11, 1 + pmovzxbw m11, [leftq-3] + psrldq xm5, 2 + psrldq xm9, 2 + shufps xm5, xm9, q2020 + movu m9, [blend_4x8_1] + vinserti128 m5, xm10, 1 + vpblendvb m6, m11, m9 + %endif +%else + lea r13, [blend_8x8_0] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -1] + movq xm9, [top2q -1] + movq xm5, [dstq +strideq*2+1] + movq xm10, [dstq +stride3q +1] + vinserti128 m6, [dstq +strideq*0-1], 1 + vinserti128 m9, [dstq +strideq*1-1], 1 + vinserti128 m5, [dst4q+strideq*0+1], 1 + vinserti128 m10, 
[dst4q+strideq*1+1], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11 +%endif + ret +.d6k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq +strideq*0] + movd xm9, [topq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm10, [dstq +stride3q ] + vinserti128 m6, [dstq +strideq*0], 1 + vinserti128 m9, [dstq +strideq*1], 1 + vinserti128 m5, [dst4q+strideq*0], 1 + vinserti128 m10, [dst4q+strideq*1], 1 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm5, [dstq +strideq*2] + movd xm6, [topq +strideq*0] + movd xm9, [dst4q+strideq*2] + pinsrd xm5, [dstq +stride3q ], 1 + pinsrd xm6, [topq +strideq*1], 1 + pinsrd xm9, [dst4q+stride3q ], 1 + pinsrd xm5, [dst4q+strideq*0], 2 + pinsrd xm6, [dstq +strideq*0], 2 + pinsrd xm9, [dst8q+strideq*0], 2 + pinsrd xm5, [dst4q+strideq*1], 3 + pinsrd xm6, [dstq +strideq*1], 3 + pinsrd xm9, [dst8q+strideq*1], 3 + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 + %endif +%else + movq xm5, [dstq +strideq*2] + movq xm9, [dst4q+strideq*0] + movq xm6, [top1q ] + movq xm10, [dstq +strideq*0] + movhps xm5, [dstq +stride3q ] + movhps xm9, [dst4q+strideq*1] + movhps xm6, [top2q ] + movhps xm10, [dstq +strideq*1] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 +%endif + ret +.d7k1: +%if %1 == 4 + %if %2 == 4 + movd xm5, [dstq +strideq*2-1] + movd xm9, [dstq +stride3q -1] + movd xm6, [topq +strideq*0+1] + movd xm10, [topq +strideq*1+1] + pinsrb xm5, [leftq+ 5], 0 + pinsrb xm9, [leftq+ 7], 0 + vinserti128 m6, [dstq +strideq*0+1], 1 + vinserti128 m10, [dstq +strideq*1+1], 1 + vinserti128 m5, [dst4q+strideq*0-1], 1 + vinserti128 m9, [dst4q+strideq*1-1], 1 + punpckldq m6, m10 + punpckldq m5, m9 + %else + movd xm6, [topq +strideq*0+1] + movq xm9, [dstq +strideq*2-1] + movq xm10, [dst4q+strideq*0-1] + movd xm11, [dst4q+strideq*2-1] + pinsrd xm6, [topq +strideq*1+1], 1 + movhps xm9, [dstq +stride3q -1] + movhps xm10, [dst4q+strideq*1-1] + pinsrd xm11, [dst4q+stride3q -1], 1 + pinsrd xm6, [dstq +strideq*0+1], 2 
+ pinsrd xm11, [dst8q+strideq*0-1], 2 + pinsrd xm6, [dstq +strideq*1+1], 3 + pinsrd xm11, [dst8q+strideq*1-1], 3 + shufps xm5, xm9, xm10, q2020 + vinserti128 m5, xm11, 1 + pmovzxbw m11, [leftq+5] + psrldq xm9, 2 + psrldq xm10, 2 + shufps xm9, xm10, q2020 + movu m10, [blend_4x8_1+8] + vinserti128 m6, xm9, 1 + vpblendvb m5, m11, m10 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movq xm6, [top1q +1] + movq xm10, [dstq +strideq*0+1] + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + movhps xm6, [top2q +1] + movhps xm10, [dstq +strideq*1+1] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 + vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11 +%endif + ret + +.border_block: + DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge +%define rstk rsp +%assign stack_offset stack_offset_entry +%if %1 == 4 && %2 == 8 + PUSH r9 + %assign regs_used 10 +%else + %assign regs_used 9 +%endif +%if STACK_ALIGNMENT < 32 + PUSH r%+regs_used + %assign regs_used regs_used+1 +%endif + ALLOC_STACK 2*16+(%2+4)*32, 16 +%define px rsp+2*16+2*32 + + pcmpeqw m14, m14 + psllw m14, 15 ; 0x8000 + + ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 + lea dst4q, [dstq+strideq*4] +%endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + pmovzxbw m1, [dstq+strideq*0] + pmovzxbw m2, [dstq+strideq*1] + pmovzxbw m3, [dstq+strideq*2] + pmovzxbw m4, [dstq+stride3q] + mova [px+0*32], m1 + mova [px+1*32], m2 + mova [px+2*32], m3 + mova [px+3*32], m4 +%if %2 == 8 + pmovzxbw m1, [dst4q+strideq*0] + pmovzxbw m2, [dst4q+strideq*1] + pmovzxbw m3, [dst4q+strideq*2] + pmovzxbw m4, [dst4q+stride3q] + mova [px+4*32], m1 + mova [px+5*32], m2 + mova [px+6*32], m3 + mova [px+7*32], m4 +%endif + jmp .body_done +.no_right: +%if %1 == 4 + movd xm1, [dstq+strideq*0] + movd xm2, [dstq+strideq*1] + movd xm3, [dstq+strideq*2] + movd xm4, 
[dstq+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+0*32], xm1 + movq [px+1*32], xm2 + movq [px+2*32], xm3 + movq [px+3*32], xm4 +%else + pmovzxbw xm1, [dstq+strideq*0] + pmovzxbw xm2, [dstq+strideq*1] + pmovzxbw xm3, [dstq+strideq*2] + pmovzxbw xm4, [dstq+stride3q] + mova [px+0*32], xm1 + mova [px+1*32], xm2 + mova [px+2*32], xm3 + mova [px+3*32], xm4 +%endif + movd [px+0*32+%1*2], xm14 + movd [px+1*32+%1*2], xm14 + movd [px+2*32+%1*2], xm14 + movd [px+3*32+%1*2], xm14 +%if %2 == 8 + %if %1 == 4 + movd xm1, [dst4q+strideq*0] + movd xm2, [dst4q+strideq*1] + movd xm3, [dst4q+strideq*2] + movd xm4, [dst4q+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+4*32], xm1 + movq [px+5*32], xm2 + movq [px+6*32], xm3 + movq [px+7*32], xm4 + %else + pmovzxbw xm1, [dst4q+strideq*0] + pmovzxbw xm2, [dst4q+strideq*1] + pmovzxbw xm3, [dst4q+strideq*2] + pmovzxbw xm4, [dst4q+stride3q] + mova [px+4*32], xm1 + mova [px+5*32], xm2 + mova [px+6*32], xm3 + mova [px+7*32], xm4 + %endif + movd [px+4*32+%1*2], xm14 + movd [px+5*32+%1*2], xm14 + movd [px+6*32+%1*2], xm14 + movd [px+7*32+%1*2], xm14 +%endif +.body_done: + + ; top + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + pmovzxbw m1, [topq+strideq*0-(%1/2)] + pmovzxbw m2, [topq+strideq*1-(%1/2)] + movu [px-2*32-%1], m1 + movu [px-1*32-%1], m2 + jmp .top_done +.top_no_right: + pmovzxbw m1, [topq+strideq*0-%1] + pmovzxbw m2, [topq+strideq*1-%1] + movu [px-2*32-%1*2], m1 + movu [px-1*32-%1*2], m2 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + pmovzxbw m1, [topq+strideq*0] + pmovzxbw m2, [topq+strideq*1] + mova [px-2*32+0], m1 + mova [px-1*32+0], m2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + jmp .top_done +.top_no_left_right: +%if %1 == 4 + movd 
xm1, [topq+strideq*0] + pinsrd xm1, [topq+strideq*1], 1 + pmovzxbw xm1, xm1 + movq [px-2*32+0], xm1 + movhps [px-1*32+0], xm1 +%else + pmovzxbw xm1, [topq+strideq*0] + pmovzxbw xm2, [topq+strideq*1] + mova [px-2*32+0], xm1 + mova [px-1*32+0], xm2 +%endif + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.no_top: + movu [px-2*32-%1], m14 + movu [px-1*32-%1], m14 +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + pmovzxbw xm1, [leftq+ 0] +%if %2 == 8 + pmovzxbw xm2, [leftq+ 8] +%endif + movd [px+0*32-4], xm1 + pextrd [px+1*32-4], xm1, 1 + pextrd [px+2*32-4], xm1, 2 + pextrd [px+3*32-4], xm1, 3 +%if %2 == 8 + movd [px+4*32-4], xm2 + pextrd [px+5*32-4], xm2, 1 + pextrd [px+6*32-4], xm2, 2 + pextrd [px+7*32-4], xm2, 3 +%endif + jmp .left_done +.no_left: + movd [px+0*32-4], xm14 + movd [px+1*32-4], xm14 + movd [px+2*32-4], xm14 + movd [px+3*32-4], xm14 +%if %2 == 8 + movd [px+4*32-4], xm14 + movd [px+5*32-4], xm14 + movd [px+6*32-4], xm14 + movd [px+7*32-4], xm14 +%endif +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge + test edgeb, 8 ; have_bottom + jz .no_bottom + lea dst8q, [dstq+%2*strideq] + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + pmovzxbw m1, [dst8q-(%1/2)] + pmovzxbw m2, [dst8q+strideq-(%1/2)] + movu [px+(%2+0)*32-%1], m1 + movu [px+(%2+1)*32-%1], m2 + jmp .bottom_done +.bottom_no_right: + pmovzxbw m1, [dst8q-%1] + pmovzxbw m2, [dst8q+strideq-%1] + movu [px+(%2+0)*32-%1*2], m1 + movu [px+(%2+1)*32-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + pmovzxbw m1, [dst8q] + pmovzxbw m2, [dst8q+strideq] + mova [px+(%2+0)*32+0], m1 + mova [px+(%2+1)*32+0], m2 + movd 
[px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + jmp .bottom_done +.bottom_no_left_right: +%if %1 == 4 + movd xm1, [dst8q] + pinsrd xm1, [dst8q+strideq], 1 + pmovzxbw xm1, xm1 + movq [px+(%2+0)*32+0], xm1 + movhps [px+(%2+1)*32+0], xm1 +%else + pmovzxbw xm1, [dst8q] + pmovzxbw xm2, [dst8q+strideq] + mova [px+(%2+0)*32+0], xm1 + mova [px+(%2+1)*32+0], xm2 +%endif + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.no_bottom: + movu [px+(%2+0)*32-%1], m14 + movu [px+(%2+1)*32-%1], m14 +.bottom_done: + + ; actual filter + INIT_YMM avx2 + DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero +%undef edged + ; register to shuffle values into after packing + vbroadcasti128 m12, [shufb_lohi] + + mov dampingd, r7m + xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + test prid, prid + jz .border_sec_only + movd xm0, prid + lzcnt pridmpd, prid + add pridmpd, dampingd + cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + test secdmpd, secdmpd + jz .border_pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3 + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + BORDER_PREP_REGS %1, %2 +%if %1*%2*2/mmsize > 1 +.border_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2, 1 +.border_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 + ACCUMULATE_TAP_WORD 2*2, 
[rsp+8], m14, m1, m3, %1, 1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 + dec kq + jge .border_k_loop + + vpbroadcastd m10, [pw_2048] + BORDER_ADJUST_PIXEL %1, m10, 1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_v_loop +%endif + RET + +.border_pri_only: + DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + DEFINE_ARGS dst, stride, dir, table, pri, _, stride3 + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + BORDER_PREP_REGS %1, %2 + vpbroadcastd m1, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_pri_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 + dec kq + jge .border_pri_k_loop + BORDER_ADJUST_PIXEL %1, m1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_pri_v_loop +%endif + RET + +.border_sec_only: + DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + DEFINE_ARGS dst, stride, dir, table, _, sec, stride3 + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + BORDER_PREP_REGS %1, %2 + vpbroadcastd m0, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_sec_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 + dec kq + jge .border_sec_k_loop + BORDER_ADJUST_PIXEL %1, m0 +%if 
%1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_sec_v_loop +%endif + RET +%endmacro + +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 + +INIT_YMM avx2 +cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + lea stride3q, [strideq*3] + movq xm0, [srcq+strideq*0] + movq xm1, [srcq+strideq*1] + movq xm2, [srcq+strideq*2] + movq xm3, [srcq+stride3q] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpbroadcastq m6, [srcq+strideq*2] + vpbroadcastq m7, [srcq+stride3q] + vpbroadcastd m8, [pw_128] + pxor m9, m9 + + vpblendd m0, m0, m7, 0xf0 + vpblendd m1, m1, m6, 0xf0 + vpblendd m2, m2, m5, 0xf0 + vpblendd m3, m3, m4, 0xf0 + + punpcklbw m0, m9 + punpcklbw m1, m9 + punpcklbw m2, m9 + punpcklbw m3, m9 + + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + + ; shuffle registers to generate partial_sum_diag[0-1] together + vpermq m7, m0, q1032 + vpermq m6, m1, q1032 + vpermq m5, m2, q1032 + vpermq m4, m3, q1032 + + ; start with partial_sum_hv[0-1] + paddw m8, m0, m1 + paddw m9, m2, m3 + phaddw m10, m0, m1 + phaddw m11, m2, m3 + paddw m8, m9 + phaddw m10, m11 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + paddw xm8, xm9 ; partial_sum_hv[1] + phaddw xm10, xm11 ; partial_sum_hv[0] + vinserti128 m8, xm10, 1 + vpbroadcastd m9, [div_table+44] + pmaddwd m8, m8 + pmulld m8, m9 ; cost6[2a-d] | cost2[a-d] + + ; create aggregates [lower half]: + ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+ + ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0 + ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+ + ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x + ; and [upper half]: + ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+ + ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567 + ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+ + ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx + ; and then shuffle m10 [shufw_6543210x], 
unpcklwd, pmaddwd, pmulld, paddd + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero] + vbroadcasti128 m14, [shufw_6543210x] + vbroadcasti128 m13, [div_table+16] + vbroadcasti128 m12, [div_table+0] + paddw m9, m0 ; partial_sum_diag[0/1][0-7] + pshufb m10, m14 + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + pmulld m11, m13 + pmulld m9, m12 + paddd m9, m11 ; cost0[a-d] | cost4[a-d] + + ; merge horizontally and vertically for partial_sum_alt[0-3] + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; create aggregates [lower half]: + ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234 + ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx + ; and [upper half]: + ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 + ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx + ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd + + pslldq m4, m11, 2 + psrldq m11, 14 + pslldq m5, m12, 4 + psrldq m12, 12 + pslldq m6, m13, 6 + psrldq m13, 10 + paddw m4, m10 + paddw m11, m12 + vpbroadcastd m12, [div_table+44] + paddw m5, m6 + paddw m11, m13 ; partial_sum_alt[3/2] right + vbroadcasti128 m13, [div_table+32] + paddw m4, m5 ; partial_sum_alt[3/2] left + pshuflw m5, m11, q3012 + punpckhwd m6, m11, m4 + punpcklwd m4, m5 + pmaddwd m6, m6 + pmaddwd m4, m4 + pmulld m6, m12 + pmulld m4, m13 + paddd m4, m6 ; cost7[a-d] | cost5[a-d] + + ; create aggregates [lower half]: + ; m5 = 
m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234 + ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx + ; and [upper half]: + ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 + ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx + ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd + + pslldq m5, m1, 2 + psrldq m1, 14 + pslldq m6, m2, 4 + psrldq m2, 12 + pslldq m7, m3, 6 + psrldq m3, 10 + paddw m5, m0 + paddw m1, m2 + paddw m6, m7 + paddw m1, m3 ; partial_sum_alt[0/1] right + paddw m5, m6 ; partial_sum_alt[0/1] left + pshuflw m0, m1, q3012 + punpckhwd m1, m5 + punpcklwd m5, m0 + pmaddwd m1, m1 + pmaddwd m5, m5 + pmulld m1, m12 + pmulld m5, m13 + paddd m5, m1 ; cost1[a-d] | cost3[a-d] + + mova xm0, [pd_47130256+ 16] + mova m1, [pd_47130256] + phaddd m9, m8 + phaddd m5, m4 + phaddd m9, m5 + vpermd m0, m9 ; cost[0-3] + vpermd m1, m9 ; cost[4-7] | cost[0-3] + + ; now find the best cost + pmaxsd xm2, xm0, xm1 + pshufd xm3, xm2, q1032 + pmaxsd xm2, xm3 + pshufd xm3, xm2, q2301 + pmaxsd xm2, xm3 ; best cost + + ; find the idx using minpos + ; make everything other than the best cost negative via subtraction + ; find the min of unsigned 16-bit ints to sort out the negative values + psubd xm4, xm1, xm2 + psubd xm3, xm0, xm2 + packssdw xm3, xm4 + phminposuw xm3, xm3 + + ; convert idx to 32-bits + psrld xm3, 16 + movd eax, xm3 + + ; get idx^4 complement + vpermd m3, m1 + psubd xm2, xm3 + psrld xm2, 10 + movd [varq], xm2 + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef_avx512.asm b/third_party/dav1d/src/x86/cdef_avx512.asm new file mode 100644 index 0000000000..b1fa1ad16f --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_avx512.asm @@ -0,0 +1,868 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if HAVE_AVX512ICL && ARCH_X86_64 + +%macro DUP4 1-* + %rep %0 + times 4 db %1 + %rotate 1 + %endrep +%endmacro + +%macro DIRS 16 ; cdef_directions[] + %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 + ; masking away unused bits allows us to use a single vpaddd {1to16} + ; instruction instead of having to do vpbroadcastd + paddb + db %13 & 0x3f, -%13 & 0x3f + %rotate 1 + %endrep +%endmacro + +SECTION_RODATA 64 + +lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 + db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 + db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 +lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 + db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 + db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 + db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 +pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 +lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 + db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 + db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 +lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 + db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 + db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 + db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 +edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 + dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 + dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 + dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff 
; 0110, 0111 + dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 + dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 + dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 + dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 +px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 +cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 +gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 + dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 + dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 + dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 + times 16 db 0 ; realign (introduced by cdef_dirs) +end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 +pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 +sec_tap: db 32, 32, 16, 16 +pd_268435568: dd 268435568 + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 5, 6 +%else +DECLARE_REG_TMP 8, 5 +%endif + +; lut: +; t0 t1 t2 t3 t4 t5 t6 t7 +; T0 T1 T2 T3 T4 T5 T6 T7 +; L0 L1 00 01 02 03 04 05 +; L2 L3 10 11 12 13 14 15 +; L4 L5 20 21 22 23 24 25 +; L6 L7 30 31 32 33 34 35 +; 4e 4f 40 41 42 43 44 45 +; 5e 5f 50 51 52 53 54 55 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +%define base r7-edge_mask + movq xmm0, [dstq+strideq*0] + movhps xmm0, [dstq+strideq*1] + lea r7, [edge_mask] + movq xmm1, [topq+strideq*0-2] + movhps xmm1, [topq+strideq*1-2] + mov r6d, edgem + vinserti32x4 ym0, ymm0, [leftq], 1 + lea r2, [strideq*3] + vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 + mova m5, [base+lut_perm_4x4] + vinserti32x4 m0, [dstq+r2], 
2 + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r3, [dstq+strideq*4-4] + vinserti32x4 m1, [r3+strideq*0], 2 + vinserti32x4 m0, [r3+strideq*1], 3 +.main: + movifnidn prid, prim + mov t0d, dirm + mova m3, [base+px_idx] + mov r3d, dampingm + vpermi2b m5, m0, m1 ; lut + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m7, m7 + lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m6, m3, m5 ; px + cmp r6d, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 +%macro CDEF_FILTER_4x4_PRI 0 + vpcmpub k1, m6, m1, 6 ; px > pN + psubb m2, m1, m6 + lzcnt r6d, prid + vpsubb m2{k1}, m6, m1 ; abs(diff) + vpbroadcastb m4, prid + and prid, 1 + vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift + movifnidn t1d, secm + vpbroadcastd m10, [base+pri_tap+priq*4] + vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) + psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) + pminub m2, m4 + vpdpbusd m0, m2, m10 ; sum +%endmacro + CDEF_FILTER_4x4_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m4, m6, m1 + pmaxub m1, m6 + pminub m5, m2, m3 + pmaxub m2, m3 + pminub m4, m5 + pmaxub m2, m1 + psrldq m1, m4, 2 + psrldq m3, m2, 2 + pminub m1, m4 + vpcmpw k1, m0, m7, 1 + vpshldd m6, m0, 8 + pmaxub m2, m3 + pslldq m3, m1, 1 + psubw m7, m0 + paddusw m0, m6 ; clip >0xff + vpsubusw m0{k1}, m6, m7 ; clip <0x00 + pslldq m4, m2, 1 + pminub m1, m3 + pmaxub m2, m4 + pmaxub m0, m1 + pminub m0, m2 + jmp .end +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) +.end: + mova xm1, [base+end_perm] + vpermb m0, m1, m0 ; output in bits 8-15 of each dword + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + RET 
+.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + vpbroadcastq m8, [base+edge_mask+r6*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m8, m2 ; index in-range + mova m1, m6 + vpermb m1{k1}, m2, m5 + CDEF_FILTER_4x4_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m8, m4 + mova m2, m6 + vpermb m2{k1}, m4, m5 + vpshufbitqmb k1, m8, m9 + mova m3, m6 + vpermb m3{k1}, m9, m5 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 + vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 +.sec_main: + vpbroadcastd m8, [base+sec_tap] + vpcmpub k1, m6, m2, 6 + psubb m4, m2, m6 + vpbroadcastb m12, t1d + lzcnt t1d, t1d + vpsubb m4{k1}, m6, m2 + vpcmpub k2, m6, m3, 6 + vpbroadcastq m11, [r3+t1*8] + gf2p8affineqb m10, m4, m11, 0 + psubb m5, m3, m6 + mova m9, m8 + vpsubb m8{k1}, m7, m8 + psubusb m10, m12, m10 + vpsubb m5{k2}, m6, m3 + pminub m4, m10 + vpdpbusd m0, m4, m8 + gf2p8affineqb m11, m5, m11, 0 + vpsubb m9{k2}, m7, m9 + psubusb m12, m11 + pminub m5, m12 + vpdpbusd m0, m5, m9 + ret + +DECLARE_REG_TMP 2, 7 + +; lut top lut bottom +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 +; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 +; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 + +cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base 
r8-edge_mask + vpbroadcastd ym21, strided + mov r6d, edgem + lea r8, [edge_mask] + movq xm1, [topq+strideq*0-2] + pmulld ym21, [base+pd_01234567] + kxnorb k1, k1, k1 + movq xm2, [topq+strideq*1-2] + vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 + mova m14, [base+lut_perm_4x8a] + movu m15, [base+lut_perm_4x8b] + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r7, [dstq+strideq*8-2] + vinserti32x4 ym1, [r7+strideq*0], 1 + vinserti32x4 ym2, [r7+strideq*1], 1 +.main: + punpcklqdq ym1, ym2 + vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ + movifnidn prid, prim + mov t0d, dirm + mova m16, [base+px_idx] + mov r3d, dampingm + vpermi2b m14, m0, m1 ; lut top + vpermi2b m15, m0, m1 ; lut bottom + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m20, m20 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m2, m16, m14 ; pxt + vpermb m3, m16, m15 ; pxb + mova m1, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 + vpermb m5, m6, m15 ; pNb +%macro CDEF_FILTER_4x8_PRI 0 + vpcmpub k1, m2, m4, 6 ; pxt > pNt + vpcmpub k2, m3, m5, 6 ; pxb > pNb + psubb m6, m4, m2 + psubb m7, m5, m3 + lzcnt r6d, prid + vpsubb m6{k1}, m2, m4 ; abs(diff_top) + vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) + vpbroadcastb m13, prid + vpbroadcastq m9, [r3+r6*8] + and prid, 1 + vpbroadcastd m11, [base+pri_tap+priq*4] + vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift + vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift + mova m10, m11 + movifnidn t1d, secm + vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) + vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) + psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) + psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) + pminub m6, m12 + pminub m7, m13 + vpdpbusd m0, m6, m10 ; sum top + vpdpbusd m1, m7, m11 ; sum bottom +%endmacro + 
CDEF_FILTER_4x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m10, m4, m2 + pminub m12, m6, m8 + pminub m11, m5, m3 + pminub m13, m7, m9 + pmaxub m4, m2 + pmaxub m6, m8 + pmaxub m5, m3 + pmaxub m7, m9 + pminub m10, m12 + pminub m11, m13 + pmaxub m4, m6 + pmaxub m5, m7 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + kxnorb k2, k2, k2 ; hw lw + vpshrdd m12, m0, m1, 16 ; m1lw m0hw + vpshrdd m6, m10, m11, 16 ; m11lw m10hw + vpshrdd m8, m4, m5, 16 ; m5lw m4hw + vpblendmw m7{k1}, m10, m11 ; m11hw m10lw + vpblendmw m9{k1}, m4, m5 ; m5hw m4lw + vpblendmw m4{k1}, m0, m12 ; m1lw m0lw + vpblendmw m5{k1}, m12, m1 ; m1hw m0hw + vpshrdd m2, m3, 16 + pminub m6, m7 + pmaxub m8, m9 + mova ym14, [base+end_perm] + vpcmpw k1, m4, m20, 1 + vpshldw m2, m5, 8 + pslldq m7, m6, 1 + pslldq m9, m8, 1 + psubw m5, m20, m4 + paddusw m0, m4, m2 ; clip >0xff + pminub m6, m7 + pmaxub m8, m9 + psubusw m0{k1}, m2, m5 ; clip <0x00 + pmaxub m0, m6 + pminub m0, m8 + vpermb m0, m14, m0 + vpscatterdd [dstq+ym21]{k2}, ym0 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova ym4, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m3, m1, 8 + paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m3 + pslld m0, 16 + vpshrdd m0, m1, 16 + vpermb m0, m4, m0 ; output in bits 8-15 of each word + vpscatterdd [dstq+ym21]{k1}, ym0 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t1d, r6d + or r6d, 8 ; top 4x4 has bottom + or t1d, 4 ; bottom 4x4 has top + vpbroadcastq m17, [base+edge_mask+r6*8] + vpbroadcastq m18, [base+edge_mask+t1*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m17, m6 ; index in-range + vpshufbitqmb k2, m18, m6 + mova m4, m2 + mova m5, m3 + vpermb m4{k1}, m6, m14 + vpermb m5{k2}, m6, m15 + CDEF_FILTER_4x8_PRI + test t1d, t1d + jz 
.end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m17, m10 + vpshufbitqmb k2, m18, m10 + vpshufbitqmb k3, m17, m11 + vpshufbitqmb k4, m18, m11 + mova m6, m2 + mova m7, m3 + mova m8, m2 + mova m9, m3 + vpermb m6{k1}, m10, m14 + vpermb m7{k2}, m10, m15 + vpermb m8{k3}, m11, m14 + vpermb m9{k4}, m11, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 + vpermb m7, m8, m15 ; pNb + vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 + vpermb m9, m9, m15 ; pNb +.sec_main: + vpbroadcastb m18, t1d + lzcnt t1d, t1d + vpcmpub k1, m2, m6, 6 + vpcmpub k2, m3, m7, 6 + vpcmpub k3, m2, m8, 6 + vpcmpub k4, m3, m9, 6 + vpbroadcastq m17, [r3+t1*8] + psubb m10, m6, m2 + psubb m11, m7, m3 + psubb m12, m8, m2 + psubb m13, m9, m3 + vpsubb m10{k1}, m2, m6 ; abs(dt0) + vpsubb m11{k2}, m3, m7 ; abs(db0) + vpsubb m12{k3}, m2, m8 ; abs(dt1) + vpsubb m13{k4}, m3, m9 ; abs(db1) + vpbroadcastd m19, [base+sec_tap] + gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift + gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift + gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift + gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift + psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) + psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) + psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) + psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) + pminub m10, m14 + pminub m11, m15 + pminub m12, m16 + pminub m13, m17 + mova m14, m19 + mova m15, m19 + mova m16, m19 + vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) + vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) + vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) + vpsubb 
m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) + vpdpbusd m0, m10, m14 + vpdpbusd m1, m11, m15 + vpdpbusd m0, m12, m16 + vpdpbusd m1, m13, m19 + ret + +; lut tl lut tr +; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td +; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD +; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b +; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; lut bl lut br +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b +; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b +; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b +; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b + +cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + mov r6d, edgem + lea r10, [dstq+strideq*4-2] + movu xmm0, [topq+strideq*0-2] + movu xmm1, [dstq+strideq*2-2] + movu xmm2, [r10 +strideq*2 ] + lea r8, [edge_mask] + lea r9, [strideq*3] + pmovzxwq m10, [leftq-4] + vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 + vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 + vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 + lea r7, [r10 +strideq*4 ] + pmovzxwq m11, [leftq+4] + vinserti32x4 m0, [dstq+strideq*0-2], 2 + vinserti32x4 m1, [r10 +strideq*0 ], 2 + mova m12, [base+lut_perm_8x8a] + movu m13, [base+lut_perm_8x8b] + vinserti32x4 m0, [dstq+strideq*1-2], 3 + vinserti32x4 m1, [r10 +strideq*1 ], 3 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m2, [r7 +strideq*0], 2 + vinserti32x4 m2, [r7 +strideq*1], 3 +.main: + mov t1d, 0x11111100 + mova m14, m12 + mova m15, m13 + kmovd k1, t1d + kshiftrd k2, k1, 8 + movifnidn 
prid, prim + mov t0d, dirm + mova m30, [base+px_idx] + mov r3d, dampingm + vpermi2b m12, m0, m1 ; lut tl + vpermi2b m14, m1, m2 ; lut bl + vpermi2b m13, m0, m1 ; lut tr + vpermi2b m15, m1, m2 ; lut br + vpblendmw m12{k1}, m12, m10 + vpblendmw m14{k2}, m14, m11 + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m31, m31 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m4, m30, m12 ; pxtl + vpermb m5, m30, m13 ; pxtr + vpermb m6, m30, m14 ; pxbl + vpermb m7, m30, m15 ; pxbr + mova m1, m0 + mova m2, m0 + mova m3, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 + vpermb m9, m11, m13 ; pNtr + vpermb m10, m11, m14 ; pNbl + vpermb m11, m11, m15 ; pNbr +%macro CDEF_FILTER_8x8_PRI 0 + vpcmpub k1, m4, m8, 6 ; pxtl > pNtl + vpcmpub k2, m5, m9, 6 ; pxtr > pNtr + vpcmpub k3, m6, m10, 6 ; pxbl > pNbl + vpcmpub k4, m7, m11, 6 ; pxbr > pNbr + psubb m16, m8, m4 + psubb m17, m9, m5 + psubb m18, m10, m6 + psubb m19, m11, m7 + lzcnt r6d, prid + vpsubb m16{k1}, m4, m8 ; abs(diff_tl) + vpsubb m17{k2}, m5, m9 ; abs(diff_tr) + vpsubb m18{k3}, m6, m10 ; abs(diff_bl) + vpsubb m19{k4}, m7, m11 ; abs(diff_br) + vpbroadcastq m28, [r3+r6*8] + vpbroadcastb m29, prid + and prid, 1 + vpbroadcastd m27, [base+pri_tap+priq*4] + vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift + vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift + vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift + vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift + mova m24, m27 + mova m25, m27 + mova m26, m27 + movifnidn t1d, secm + vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) + vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl) + vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br) + psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) + psubusb m21, m29, m21 ; imax(0, 
pri_strength - (abs(dtr) >> shift))) + psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) + psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) + pminub m16, m20 + pminub m17, m21 + pminub m18, m22 + pminub m19, m23 + vpdpbusd m0, m16, m24 ; sum tl + vpdpbusd m1, m17, m25 ; sum tr + vpdpbusd m2, m18, m26 ; sum bl + vpdpbusd m3, m19, m27 ; sum br +%endmacro + CDEF_FILTER_8x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m20, m8, m4 + pminub m24, m12, m16 + pminub m21, m9, m5 + pminub m25, m13, m17 + pminub m22, m10, m6 + pminub m26, m14, m18 + pminub m23, m11, m7 + pminub m27, m15, m19 + pmaxub m8, m4 + pmaxub m12, m16 + pmaxub m9, m5 + pmaxub m13, m17 + pmaxub m10, m6 + pmaxub m14, m18 + pmaxub m11, m7 + pmaxub m15, m19 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + pmaxub m8, m12 + pmaxub m9, m13 + pmaxub m10, m14 + pmaxub m11, m15 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + vpshrdd m24, m0, m1, 16 + vpshrdd m25, m2, m3, 16 + vpshrdd m12, m20, m21, 16 + vpshrdd m14, m22, m23, 16 + vpshrdd m16, m8, m9, 16 + vpshrdd m18, m10, m11, 16 + vpblendmw m13{k1}, m20, m21 + vpblendmw m15{k1}, m22, m23 + vpblendmw m17{k1}, m8, m9 + vpblendmw m19{k1}, m10, m11 + vpblendmw m20{k1}, m0, m24 + vpblendmw m21{k1}, m24, m1 + vpblendmw m22{k1}, m2, m25 + vpblendmw m23{k1}, m25, m3 + vpshrdd m4, m5, 16 + vpshrdd m6, m7, 16 + pminub m12, m13 + pminub m14, m15 + pmaxub m16, m17 + pmaxub m18, m19 + mova m8, [base+end_perm_w8clip] + vpcmpw k2, m20, m31, 1 + vpcmpw k3, m22, m31, 1 + vpshldw m4, m21, 8 + vpshldw m6, m23, 8 + kunpckdq k1, k1, k1 + kxnorb k4, k4, k4 + vpshrdw m11, m12, m14, 8 + vpshrdw m15, m16, m18, 8 + vpblendmb m13{k1}, m12, m14 + vpblendmb m17{k1}, m16, m18 + psubw m21, m31, m20 + psubw m23, m31, m22 + paddusw m0, m20, m4 ; clip >0xff + paddusw m1, m22, m6 + pminub m11, m13 + pmaxub m15, m17 + psubusw m0{k2}, m4, m21 ; clip <0x00 + psubusw m1{k3}, m6, m23 + psrlw m0, 8 + vmovdqu8 
m0{k1}, m1 + pmaxub m0, m11 + pminub m0, m15 + vpermb m0, m8, m0 + add r10, 2 + vextracti32x4 xm1, m0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movq [r10 +strideq*0], xm2 + movq [r10 +strideq*2], xm3 + movhps [dstq+strideq*1], xm0 + movhps [dstq+r9 ], xm1 + movhps [r10 +strideq*1], xm2 + movhps [r10 +r9 ], xm3 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova xm8, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m5, m1, 8 + vpshldd m6, m2, 8 + vpshldd m7, m3, 8 + paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + vpermb m0, m8, m0 + vpermb m1, m8, m1 + vpermb m2, m8, m2 + vpermb m3, m8, m3 + add r10, 2 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm0 + movq [r10 +strideq*0], xm5 + movq [r10 +strideq*2], xm2 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r9 ], xm0 + movhps [r10 +strideq*1], xm5 + movhps [r10 +r9 ], xm2 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t0d, r6d + mov t1d, r6d + or t0d, 0xA ; top-left 4x4 has bottom and right + or t1d, 0x9 ; top-right 4x4 has bottom and left + vpbroadcastq m26, [base+edge_mask+t0*8] + vpbroadcastq m27, [base+edge_mask+t1*8] + mov t1d, r6d + or r6d, 0x6 ; bottom-left 4x4 has top and right + or t1d, 0x5 ; bottom-right 4x4 has top and left + vpbroadcastq m28, [base+edge_mask+r6*8] + vpbroadcastq m29, [base+edge_mask+t1*8] + mov t0d, dirm + test prid, prid + jz .mask_edges_sec_only + vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m26, m20 ; index in-range + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m8, m4 + mova m9, m5 + mova m10, m6 + mova m11, m7 + vpermb m8{k1}, m20, m12 + vpermb 
m9{k2}, m20, m13 + vpermb m10{k3}, m20, m14 + vpermb m11{k4}, m20, m15 + mova [rsp+0x00], m26 + mova [rsp+0x40], m27 + mova [rsp+0x80], m28 + mova [rsp+0xC0], m29 + CDEF_FILTER_8x8_PRI + test t1d, t1d + jz .end_no_clip + mova m26, [rsp+0x00] + mova m27, [rsp+0x40] + mova m28, [rsp+0x80] + mova m29, [rsp+0xC0] + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m26, m20 + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m16, m4 + mova m17, m5 + mova m18, m6 + mova m19, m7 + vpermb m16{k1}, m20, m12 + vpermb m17{k2}, m20, m13 + vpermb m18{k3}, m20, m14 + vpermb m19{k4}, m20, m15 + vpshufbitqmb k1, m26, m21 + vpshufbitqmb k2, m27, m21 + vpshufbitqmb k3, m28, m21 + vpshufbitqmb k4, m29, m21 + vpermb m12, m21, m12 + vpermb m13, m21, m13 + vpermb m14, m21, m14 + vpermb m15, m21, m15 + vpblendmb m12{k1}, m4, m12 + vpblendmb m13{k2}, m5, m13 + vpblendmb m14{k3}, m6, m14 + vpblendmb m15{k4}, m7, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 + vpermb m17, m20, m13 ; pNtr + vpermb m18, m20, m14 ; pNbl + vpermb m19, m20, m15 ; pNbr + vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 + vpermb m13, m21, m13 ; pNtr + vpermb m14, m21, m14 ; pNbl + vpermb m15, m21, m15 ; pNbr +.sec_main: +%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants + vpcmpub k1, m4, %1, 6 + vpcmpub k2, m5, %2, 6 + vpcmpub k3, m6, %3, 6 + vpcmpub k4, m7, %4, 6 + psubb m20, %1, m4 + psubb m21, %2, m5 + psubb m22, %3, m6 + psubb m23, %4, m7 +%if %5 + vpbroadcastb m28, t1d + lzcnt t1d, t1d + vpbroadcastq m29, [r3+t1*8] +%endif + vpsubb m20{k1}, m4, %1 + vpsubb m21{k2}, m5, %2 + vpsubb m22{k3}, m6, %3 + vpsubb m23{k4}, m7, %4 + gf2p8affineqb m24, m20, m29, 0 + gf2p8affineqb 
m25, m21, m29, 0 + gf2p8affineqb m26, m22, m29, 0 + gf2p8affineqb m27, m23, m29, 0 +%if %5 + vpbroadcastd m30, [base+sec_tap] +%endif + psubusb m24, m28, m24 + psubusb m25, m28, m25 + psubusb m26, m28, m26 + psubusb m27, m28, m27 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + mova m24, m30 + mova m25, m30 + mova m26, m30 + mova m27, m30 + vpsubb m24{k1}, m31, m30 + vpsubb m25{k2}, m31, m30 + vpsubb m26{k3}, m31, m30 + vpsubb m27{k4}, m31, m30 + vpdpbusd m0, m20, m24 + vpdpbusd m1, m21, m25 + vpdpbusd m2, m22, m26 + vpdpbusd m3, m23, m27 +%endmacro + CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 + CDEF_FILTER_8x8_SEC m12, m13, m14, m15 + ret + +%endif ; HAVE_AVX512ICL && ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef_init_tmpl.c b/third_party/dav1d/src/x86/cdef_init_tmpl.c new file mode 100644 index 0000000000..edc3b5d4bf --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_init_tmpl.c @@ -0,0 +1,94 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" +/* Declare the avx512icl, avx2, sse4, ssse3 and sse2 variants of the CDEF filter for block size sz. */ +#define decl_cdef_size_fn(sz) \ + decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \ + decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \ + decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \ + decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \ + decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2) + +decl_cdef_size_fn(4x4); +decl_cdef_size_fn(4x8); +decl_cdef_size_fn(8x8); + +decl_cdef_dir_fn(dav1d_cdef_dir_avx2); +decl_cdef_dir_fn(dav1d_cdef_dir_sse4); +decl_cdef_dir_fn(dav1d_cdef_dir_ssse3); +/* Install the x86 CDEF functions permitted by the flags from dav1d_get_cpu_flags(). Tiers are tested from SSE2 upwards; each tier overwrites the previous pointers and a missing flag returns early, so c keeps the last tier that passed. Pointers are only assigned when BITDEPTH == 8. fb[0], fb[1] and fb[2] take the 8x8, 4x8 and 4x4 filters. */ +COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; /* nothing assigned yet: c is left untouched */ + +#if BITDEPTH == 8 + c->fb[0] = dav1d_cdef_filter_8x8_sse2; /* the SSE2 tier sets no c->dir */ + c->fb[1] = dav1d_cdef_filter_4x8_sse2; + c->fb[2] = dav1d_cdef_filter_4x4_sse2; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + +#if BITDEPTH == 8 + c->dir = dav1d_cdef_dir_ssse3; + c->fb[0] = dav1d_cdef_filter_8x8_ssse3; + c->fb[1] = dav1d_cdef_filter_4x8_ssse3; + c->fb[2] = dav1d_cdef_filter_4x4_ssse3; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 8 + c->dir = dav1d_cdef_dir_sse4; + c->fb[0] = dav1d_cdef_filter_8x8_sse4; + c->fb[1] = dav1d_cdef_filter_4x8_sse4; + c->fb[2] = dav1d_cdef_filter_4x4_sse4; +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + +#if BITDEPTH == 8 + c->dir = 
dav1d_cdef_dir_avx2; + c->fb[0] = dav1d_cdef_filter_8x8_avx2; + c->fb[1] = dav1d_cdef_filter_4x8_avx2; + c->fb[2] = dav1d_cdef_filter_4x4_avx2; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; +/* Only the filters have an avx512icl declaration above; c->dir is not changed at this tier. */ +#if HAVE_AVX512ICL && BITDEPTH == 8 + c->fb[0] = dav1d_cdef_filter_8x8_avx512icl; + c->fb[1] = dav1d_cdef_filter_4x8_avx512icl; + c->fb[2] = dav1d_cdef_filter_4x4_avx512icl; +#endif + +#endif /* ARCH_X86_64 */ +} diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm new file mode 100644 index 0000000000..2dcaf22a68 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_sse.asm @@ -0,0 +1,1355 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2019, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro DUP8 1-* ; emit every argument as 8 identical bytes + %rep %0 + times 8 db %1 + %rotate 1 + %endrep +%endmacro + +div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 ; 16 dword multipliers applied with MULLD in CDEF_DIR + dd 420, 210, 140, 105, 105, 105, 105, 105 +div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 ; same 16 values, each stored as a duplicated word pair + dw 168, 168, 140, 140, 120, 120, 105, 105 + dw 420, 420, 210, 210, 140, 140, 105, 105 + dw 105, 105, 105, 105, 105, 105, 105, 105 +shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 ; pshufb mask: reverse words 0-6, keep word 7 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; pshufb mask: interleave bytes 0-7 with bytes 8-15 +pw_8: times 8 dw 8 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_0x7FFF: times 8 dw 0x7FFF +pw_0x8000: times 8 dw 0x8000 +tap_table: ; masks for 8-bit shift emulation (entry n is 0xFF shifted left by n, kept to 8 bits) + DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 + ; weights: CDEF_FILTER reads pairs {4, 2} or {3, 3} for primary taps and {2, 1} for secondary taps + DUP8 4, 2, 3, 3, 2, 1 + ; taps indices: 2 bytes per direction, each dy*16+dx in words (buffer rows are 16 words apart) + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +SECTION .text + +%macro movif32 2 ; mov %1, %2 on x86-32 builds only, nothing on x86-64 + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +%macro
PMOVZXBW 2-3 0 ; %3 = half (load 4 bytes, not 8); widens bytes to words, the fallback unpacks against m7 (must be 0) + %if cpuflag(sse4) && %3 == 0 + pmovzxbw %1, %2 + %else + %if %3 == 1 + movd %1, %2 + %else + movq %1, %2 + %endif + punpcklbw %1, m7 + %endif +%endmacro + +%macro PSHUFB_0 2 ; broadcast byte 0 of %1 to every byte (on SSSE3 %2 must be an all-zero mask) + %if cpuflag(ssse3) + pshufb %1, %2 + %else + punpcklbw %1, %1 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 + %endif +%endmacro + +%macro MOVDDUP 2 ; load 8 bytes from %2 into both halves of %1 +%if cpuflag(ssse3) + movddup %1, %2 +%else + movq %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax + ; load p0/p1 + movsx offq, byte [dirq+kq+%1+14*8] ; off1 (14*8 skips the masks and weights of tap_table) + %if %6 == 4 + movq m5, [stkq+offq*2+32*0] ; p0 + movhps m5, [stkq+offq*2+32*1] + %else + movu m5, [stkq+offq*2+32*0] ; p0 + %endif + neg offq ; -off1 + %if %6 == 4 + movq m6, [stkq+offq*2+32*0] ; p1 + movhps m6, [stkq+offq*2+32*1] + %else + movu m6, [stkq+offq*2+32*0] ; p1 + %endif + %if %7 + %if cpuflag(sse4) + ; out of bounds values are set to a value that is both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 + pminuw m8, m5 + pmaxsw m7, m6 + pminuw m8, m6 + %else + pcmpeqw m3, m14, m5 ; m14 = out-of-bounds fill value + pminsw m8, m5 ; min after p0 + pandn m3, m5 ; zero out-of-bounds p0 so it cannot win the max + pmaxsw m7, m3 ; max after p0 + pcmpeqw m3, m14, m6 + pminsw m8, m6 ; min after p1 + pandn m3, m6 + pmaxsw m7, m3 ; max after p1 + %endif + %endif + + ; accumulate sum[m13] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + packsswb m5, m6 ; convert pixel diff to 8-bit + %if cpuflag(ssse3) + pshufb m5, m13 ; group diffs p0 and p1 into pairs + pabsb m6, m5 + psignb m3, %5, m5 + %else + movlhps m6, m5 + punpckhbw m6, m5 + pxor m5, m5 + pcmpgtb m5, m6 + paddb m6, m5 + pxor m6, m5 + paddb m3, %5, m5 + pxor m3, m5 + %endif + pand m9, %3, m6 ; emulate 8-bit shift + psrlw m9, %2 + psubusb m5, %4, m9 ; saturating: strength minus shifted abs diff, floored at 0 + pminub m5, m6 ; constrain(diff_p) + %if cpuflag(ssse3) + pmaddubsw m5, m3 ; constrain(diff_p) * taps + %else + psrlw m9, m5, 8 + psraw m6, m3, 8 + psllw m5, 8 + psllw m3, 8 + pmullw m9, m6 + pmulhw m5, m3 + paddw m5, m9 + %endif + paddw m0, m5 ; add the weighted pair to the running sum +%endmacro + +%macro LOAD_BODY 3 ; dst, src, block_width: widen 4 rows of src (8 bytes each for width 4, else 16) to words at 32-byte row pitch; needs m7 == 0 + %if %3 == 4 + PMOVZXBW m0, [%2+strideq*0] + PMOVZXBW m1, [%2+strideq*1] + PMOVZXBW m2, [%2+strideq*2] + PMOVZXBW m3, [%2+stride3q] + mova [%1+32*0], m0 + mova [%1+32*1], m1 + mova [%1+32*2], m2 + mova [%1+32*3], m3 + %else + movu m0, [%2+strideq*0] + movu m1, [%2+strideq*1] + movu m2, [%2+strideq*2] + movu m3, [%2+stride3q] + punpcklbw m4, m0, m7 + punpckhbw m0, m7 + mova [%1+32*0+ 0], m4 + mova [%1+32*0+16], m0 + punpcklbw m4, m1, m7 + punpckhbw m1, m7 + mova [%1+32*1+ 0], m4 + mova [%1+32*1+16], m1 + punpcklbw m4, m2, m7 + punpckhbw m2, m7 + mova [%1+32*2+ 0], m4 + mova [%1+32*2+16], m2 + punpcklbw m4, m3, m7 + punpckhbw m3, m7 + mova [%1+32*3+ 0], m4 + mova [%1+32*3+16], m3 + %endif +%endmacro + +%macro CDEF_FILTER_END 2 ; w, minmax: round the sum in m0, add it to px in m4, optionally clamp, store 8 pixels + pxor m6, m6 + pcmpgtw m6, m0 + paddw m0, m6 ; subtract 1 from negative sums + %if cpuflag(ssse3) + pmulhrsw m0, m15 ; (sum+8)>>4 when m15 holds pw_2048 + %else + paddw m0, m15 + psraw m0, 4 +
%endif + paddw m4, m0 + %if %2 + pminsw m4, m7 ; clamp to the running max + pmaxsw m4, m8 ; clamp to the running min + %endif + packuswb m4, m4 + %if %1 == 4 + movd [dstq+strideq*0], m4 + psrlq m4, 32 + movd [dstq+strideq*1], m4 + add stkq, 32*2 + lea dstq, [dstq+strideq*2] + %else + movq [dstq], m4 + add stkq, 32 + add dstq, strideq + %endif +%endmacro + +%macro CDEF_FILTER 2 ; w, h + %if ARCH_X86_64 +cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, pri, sec, edge, stride3, dst4 + %define px rsp+3*16+2*32 + %define base 0 + %else +cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ + dst, stride, left, edge, stride3 + %define topq r2 + %define dst4q r2 + LEA r5, tap_table + %define px esp+7*16+2*32 + %define base r5-tap_table + %endif + mov edged, r8m + %if cpuflag(sse4) + %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] + %else + %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] + %endif + mova m6, OUT_OF_BOUNDS_MEM ; m6 = fill value written wherever a neighbour is unavailable + pxor m7, m7 ; m7 = 0, used for byte to word unpacking + + ; prepare pixel buffers - body/right + %if %2 == 8 + lea dst4q, [dstq+strideq*4] + %endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + LOAD_BODY px, dstq, %1 + %if %2 == 8 + LOAD_BODY px+4*32, dst4q, %1 + %endif + jmp .body_done +.no_right: ; load only w pixels per row and fill the 2 words to their right + PMOVZXBW m0, [dstq+strideq*0], %1 == 4 + PMOVZXBW m1, [dstq+strideq*1], %1 == 4 + PMOVZXBW m2, [dstq+strideq*2], %1 == 4 + PMOVZXBW m3, [dstq+stride3q ], %1 == 4 + mova [px+32*0], m0 + mova [px+32*1], m1 + mova [px+32*2], m2 + mova [px+32*3], m3 + movd [px+32*0+%1*2], m6 + movd [px+32*1+%1*2], m6 + movd [px+32*2+%1*2], m6 + movd [px+32*3+%1*2], m6 + %if %2 == 8 + PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 + PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 + PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 + PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 + mova [px+32*4], m0 + mova [px+32*5], m1 + mova [px+32*6], m2 + mova [px+32*7], m3 + movd [px+32*4+%1*2], m6 + movd [px+32*5+%1*2], m6 + movd [px+32*6+%1*2], m6 + movd [px+32*7+%1*2], m6 + %endif +.body_done: + + ; top: the two rows above the block are stored at px-32*2 and px-32*1 + movifnidn topq, r3mp + test edgeb, 4 ; 
have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-2] + PMOVZXBW m1, [topq+strideq*1-2] + %else + movu m0, [topq+strideq*0-4] + movu m1, [topq+strideq*1-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px-32*2+8], m2 + movu [px-32*1+8], m3 + %endif + movu [px-32*2-%1], m0 + movu [px-32*1-%1], m1 + jmp .top_done +.top_no_right: ; top rows with left but no right neighbours + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-%1] + PMOVZXBW m1, [topq+strideq*1-%1] + movu [px-32*2-8], m0 + movu [px-32*1-8], m1 + %else + movu m0, [topq+strideq*0-%1] + movu m1, [topq+strideq*1-%1] ; width offset, same as the row above; the block height only matched it because 8-wide blocks are 8 tall + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px-32*2-16], m0 + mova [px-32*2+ 0], m2 + mova [px-32*1-16], m1 + mova [px-32*1+ 0], m3 + %endif + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.top_no_left: ; top rows without left neighbours + test edgeb, 2 ; have_right + jz .top_no_left_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0] + PMOVZXBW m1, [topq+strideq*1] + %else + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movd [px-32*2+16], m2 + movd [px-32*1+16], m3 + %endif + movd [px-32*2- 4], m6 + movd [px-32*1- 4], m6 + mova [px-32*2+ 0], m0 + mova [px-32*1+ 0], m1 + jmp .top_done +.top_no_left_right: ; top rows with neither side: fill 2 words on both sides + PMOVZXBW m0, [topq+strideq*0], %1 == 4 + PMOVZXBW m1, [topq+strideq*1], %1 == 4 + movd [px-32*2-4], m6 + movd [px-32*1-4], m6 + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.no_top: ; no rows above: fill both border rows with the out-of-bounds value + movu [px-32*2- 4], m6 + movu [px-32*1- 4], m6 + %if %1 == 8 + movq [px-32*2+12], m6 + movq [px-32*1+12], m6 + %endif +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + movifnidn leftq, leftmp ; left supplies 2 bytes per block row + %if %2 == 4 + movq m0, [leftq] + %else + movu m0, [leftq] + %endif + %if %2 == 4 + punpcklbw m0, m7 + %else + 
punpckhbw m1, m0, m7 + punpcklbw m0, m7 + movhlps m3, m1 + movd [px+32*4-4], m1 + movd [px+32*6-4], m3 + psrlq m1, 32 + psrlq m3, 32 + movd [px+32*5-4], m1 + movd [px+32*7-4], m3 + %endif + movhlps m2, m0 + movd [px+32*0-4], m0 + movd [px+32*2-4], m2 + psrlq m0, 32 + psrlq m2, 32 + movd [px+32*1-4], m0 + movd [px+32*3-4], m2 + jmp .left_done +.no_left: ; fill the 2 words left of every block row + movd [px+32*0-4], m6 + movd [px+32*1-4], m6 + movd [px+32*2-4], m6 + movd [px+32*3-4], m6 + %if %2 == 8 + movd [px+32*4-4], m6 + movd [px+32*5-4], m6 + movd [px+32*6-4], m6 + movd [px+32*7-4], m6 + %endif +.left_done: + + ; bottom: the two rows below the block go to buffer rows h and h+1 + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3 + %else + DEFINE_ARGS dst, stride, dst8, edge, stride3 + %endif + test edgeb, 8 ; have_bottom + jz .no_bottom + lea dst8q, [dstq+%2*strideq] ; first row below the block, read from dst itself + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + %if %1 == 4 + PMOVZXBW m0, [dst8q-(%1/2)] + PMOVZXBW m1, [dst8q+strideq-(%1/2)] + %else + movu m0, [dst8q-4] + movu m1, [dst8q+strideq-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px+32*(%2+0)+8], m2 + movu [px+32*(%2+1)+8], m3 + %endif + movu [px+32*(%2+0)-%1], m0 + movu [px+32*(%2+1)-%1], m1 + jmp .bottom_done +.bottom_no_right: + %if %1 == 4 + PMOVZXBW m0, [dst8q-4] + PMOVZXBW m1, [dst8q+strideq-4] + movu [px+32*(%2+0)-8], m0 + movu [px+32*(%2+1)-8], m1 + %else + movu m0, [dst8q-8] + movu m1, [dst8q+strideq-8] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)-16], m0 + mova [px+32*(%2+0)+ 0], m2 + mova [px+32*(%2+1)-16], m1 + mova [px+32*(%2+1)+ 0], m3 + movd [px+32*(%2-1)+16], m6 ; overwritten by first mova + %endif + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + %if %1 == 4 + PMOVZXBW m0, [dst8q] + PMOVZXBW m1, [dst8q+strideq] + %else + movu m0, [dst8q] + 
movu m1, [dst8q+strideq] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)+16], m2 + mova [px+32*(%2+1)+16], m3 + %endif + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.bottom_no_left_right: + PMOVZXBW m0, [dst8q+strideq*0], %1 == 4 + PMOVZXBW m1, [dst8q+strideq*1], %1 == 4 + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.no_bottom: ; no rows below: fill both border rows with the out-of-bounds value + movu [px+32*(%2+0)- 4], m6 + movu [px+32*(%2+1)- 4], m6 + %if %1 == 8 + movq [px+32*(%2+0)+12], m6 + movq [px+32*(%2+1)+12], m6 + %endif +.bottom_done: + + ; actual filter + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, pridmp, damping, pri, sec + mova m13, [shufb_lohi] + %if cpuflag(ssse3) + mova m15, [pw_2048] + %else + mova m15, [pw_8] + %endif + mova m14, m6 ; keep the out-of-bounds fill value; m6 is a scratch register in ACCUMULATE_TAP + %else + DEFINE_ARGS dst, pridmp, sec, damping, pri, tap + %xdefine m8 m1 + %xdefine m9 m2 + %xdefine m10 m0 + %xdefine m13 [base+shufb_lohi] + %xdefine m14 OUT_OF_BOUNDS_MEM + %if cpuflag(ssse3) + %xdefine m15 [base+pw_2048] + %else + %xdefine m15 [base+pw_8] + %endif + %endif + movifnidn prid, r4m + movifnidn secd, r5m + mov dampingd, r7m + movif32 [esp+0x3C], r1d ; x86-32: save r1, reloaded into strideq after each k loop + test prid, prid + jz .sec_only ; primary strength is 0 + movd m1, prim + bsr pridmpd, prid ; index of the highest set bit of pri + test secd, secd + jz .pri_only ; secondary strength is 0 + movd m10, r5m + bsr secd, secd + and prid, 1 + sub pridmpd, dampingd + sub secd, dampingd + xor dampingd, dampingd + add prid, prid ; (pri & 1) * 2 picks the primary weight pair later + neg pridmpd + cmovs pridmpd, dampingd ; pri_shift = max(0, damping - bsr(pri)) + neg secd + cmovs secd, dampingd ; sec_shift = max(0, damping - bsr(sec)) + PSHUFB_0 m1, m7 + PSHUFB_0 m10, m7 + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, pridmp, tap, pri, sec + lea tapq, [tap_table] + MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask + MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask + mov [rsp+0x00], pridmpq ; pri_shift + mov [rsp+0x10], secq ; sec_shift + DEFINE_ARGS dst, stride, 
dir, tap, pri, stk, k, off, h + %else + MOVDDUP m2, [tapq+pridmpq*8] + MOVDDUP m3, [tapq+secq*8] + mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw + mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP + mov [esp+0x00], pridmpd + mov [esp+0x30], secd + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %define offq dstq + %define kd strided + %define kq strideq + mova [esp+0x10], m2 + mova [esp+0x40], m3 + mova [esp+0x20], m1 + mova [esp+0x50], m10 + %endif + mov dird, r6m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] ; pri_taps + mov hd, %1*%2/8 ; pass count: every pass filters 8 pixels (one 8-wide row or two 4-wide rows) + lea dirq, [tapq+dirq*2] ; two tap-index bytes per direction +.v_loop: + movif32 [esp+0x38], dstd ; x86-32: offq aliases dstq, so dst is saved here + mov kd, 1 ; k runs 1, 0 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] ; px + %endif + pxor m0, m0 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + MOVDDUP m2, [priq+kq*8] + %if ARCH_X86_64 + ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 ; tap entry dir+2 + ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 ; tap entry dir+6 + %else + ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] ; reload: m9 is defined as m2 on x86-32 and ACCUMULATE_TAP writes m9 + ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + %endif + dec kd + jge .k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 1 + dec hd + jg .v_loop + RET + +.pri_only: ; secondary strength is 0: primary taps only, no min/max clamp +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero + lea tapq, [tap_table] + %else + DEFINE_ARGS dst, pridmp, zero, damping, pri, tap + %endif + and prid, 1 + xor zerod, zerod + sub dampingd, pridmpd + cmovs dampingd, zerod ; pri_shift = max(0, damping - bsr(pri)) + add prid, prid + PSHUFB_0 m1, m7 + MOVDDUP m7, [tapq+dampingq*8] ; shift mask for pri_shift + mov [rsp+0x00], dampingq + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h + %else + mov [rsp+0x04], zerod + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %endif + mov 
dird, r6m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.pri_v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.pri_k_loop: + MOVDDUP m2, [priq+kq*8] + ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .pri_k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .pri_v_loop + RET + +.sec_only: ; primary strength is 0: secondary taps only, no min/max clamp +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero +%else + DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero +%endif + movd m1, r5m + bsr secd, secd + mov dird, r6m + xor zerod, zerod + sub dampingd, secd + cmovs dampingd, zerod ; sec_shift = max(0, damping - bsr(sec)) + PSHUFB_0 m1, m7 + %if ARCH_X86_64 + lea tapq, [tap_table] + %else + mov [rsp+0x04], zerod + %endif + mov [rsp+0x00], dampingq + MOVDDUP m7, [tapq+dampingq*8] ; shift mask for sec_shift + lea dirq, [tapq+dirq*2] + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h + %else + DEFINE_ARGS dst, stride, off, stk, dir, tap, h + %endif + lea stkq, [px] + mov hd, %1*%2/8 +.sec_v_loop: + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.sec_k_loop: + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 + %if ARCH_X86_32 + MOVDDUP m2, [tapq+12*8+kq*8] + %endif + ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .sec_k_loop + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .sec_v_loop + RET +%endmacro + +%macro MULLD 2 ; %1 *= %2 per dword; the pre-SSE4 path clobbers m15 (m1 on x86-32) and needs both words of each %2 dword equal, as in div_table_ssse3 + %if cpuflag(sse4) + pmulld %1, %2 + %else + %if ARCH_X86_32 + %define m15 m1 + %endif + pmulhuw m15, %1, %2 + pmullw %1, %2 + pslld m15, 16 + paddd %1, m15 + %endif +%endmacro + +%macro CDEF_DIR 0 + %if ARCH_X86_64 +cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3 + lea stride3q, [strideq*3] + movq m1, [srcq+strideq*0] + movhps m1, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + 
movhps m3, [srcq+stride3q] + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + + pxor m8, m8 + psadbw m0, m1, m8 + psadbw m2, m3, m8 + psadbw m4, m5, m8 + psadbw m6, m7, m8 + packssdw m0, m2 + packssdw m4, m6 + packssdw m0, m4 + SWAP m0, m9 + + punpcklbw m0, m1, m8 + punpckhbw m1, m8 + punpcklbw m2, m3, m8 + punpckhbw m3, m8 + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + punpcklbw m6, m7, m8 + punpckhbw m7, m8 + + mova m8, [pw_128] + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + psubw m4, m8 + psubw m5, m8 + psubw m6, m8 + psubw m7, m8 + psllw m8, 3 + psubw m9, m8 ; partial_sum_hv[0] + + paddw m8, m0, m1 + paddw m10, m2, m3 + paddw m8, m4 + paddw m10, m5 + paddw m8, m6 + paddw m10, m7 + paddw m8, m10 ; partial_sum_hv[1] + + pmaddwd m8, m8 + pmaddwd m9, m9 + phaddd m9, m8 + SWAP m8, m9 + MULLD m8, [div_table%+SUFFIX+48] + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m0 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 ; partial_sum_diag[0] top/right half + paddw m9, m11 ; partial_sum_diag[0] top/left half + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 ; partial_sum_diag[0][0-7] + paddw m10, m14 ; partial_sum_diag[0][8-14,zero] + pshufb m10, [shufw_6543210x] + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + MULLD m11, [div_table%+SUFFIX+16] + MULLD m9, [div_table%+SUFFIX+0] + paddd m9, m11 ; cost[0a-d] + + pslldq m10, m0, 14 + psrldq m11, m0, 2 + pslldq m12, m1, 12 + psrldq m13, m1, 4 + pslldq m14, m2, 10 + psrldq m15, m2, 6 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m3, 8 + 
psrldq m13, m3, 8 + pslldq m14, m4, 6 + psrldq m15, m4, 10 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m5, 4 + psrldq m13, m5, 12 + pslldq m14, m6, 2 + psrldq m15, m6, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 ; partial_sum_diag[1][8-14,zero] + paddw m10, m7 ; partial_sum_diag[1][0-7] + pshufb m11, [shufw_6543210x] + punpckhwd m12, m10, m11 + punpcklwd m10, m11 + pmaddwd m12, m12 + pmaddwd m10, m10 + MULLD m12, [div_table%+SUFFIX+16] + MULLD m10, [div_table%+SUFFIX+0] + paddd m10, m12 ; cost[4a-d] + phaddd m9, m10 ; cost[0a/b,4a/b] + + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) + pslldq m4, m11, 2 + psrldq m5, m11, 14 + pslldq m6, m12, 4 + psrldq m7, m12, 12 + pslldq m14, m13, 6 + psrldq m15, m13, 10 + paddw m4, m10 + paddw m5, m7 + paddw m4, m6 + paddw m5, m15 ; partial_sum_alt[3] right + paddw m4, m14 ; partial_sum_alt[3] left + pshuflw m6, m5, q3012 + punpckhwd m5, m4 + punpcklwd m4, m6 + pmaddwd m5, m5 + pmaddwd m4, m4 + MULLD m5, [div_table%+SUFFIX+48] + MULLD m4, [div_table%+SUFFIX+32] + paddd m4, m5 ; cost[7a-d] + + pslldq m5, m10, 6 + psrldq m6, m10, 10 + pslldq m7, m11, 4 + psrldq m10, m11, 12 + pslldq m11, m12, 2 + psrldq m12, 14 + paddw m5, m7 + paddw m6, m10 + paddw m5, m11 + paddw m6, m12 + paddw m5, m13 + pshuflw m7, m6, q3012 + punpckhwd m6, m5 + punpcklwd m5, m7 + pmaddwd m6, m6 + pmaddwd m5, m5 + MULLD m6, [div_table%+SUFFIX+48] + MULLD m5, [div_table%+SUFFIX+32] + paddd m5, m6 ; cost[5a-d] + + pslldq m6, m1, 2 + psrldq m7, m1, 14 + pslldq m10, m2, 4 + psrldq m11, m2, 12 + pslldq m12, m3, 6 + psrldq m13, m3, 10 + paddw m6, m0 + paddw m7, m11 + paddw m6, m10 + paddw m7, m13 ; partial_sum_alt[3] right + paddw m6, m12 ; partial_sum_alt[3] left + pshuflw m10, m7, q3012 + punpckhwd m7, m6 + punpcklwd m6, m10 + 
pmaddwd m7, m7 + pmaddwd m6, m6 + MULLD m7, [div_table%+SUFFIX+48] + MULLD m6, [div_table%+SUFFIX+32] + paddd m6, m7 ; cost[1a-d] + + pshufd m0, m0, q1032 + pshufd m1, m1, q1032 + pshufd m2, m2, q1032 + pshufd m3, m3, q1032 + + pslldq m10, m0, 6 + psrldq m11, m0, 10 + pslldq m12, m1, 4 + psrldq m13, m1, 12 + pslldq m14, m2, 2 + psrldq m2, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m2 + paddw m10, m3 + pshuflw m12, m11, q3012 + punpckhwd m11, m10 + punpcklwd m10, m12 + pmaddwd m11, m11 + pmaddwd m10, m10 + MULLD m11, [div_table%+SUFFIX+48] + MULLD m10, [div_table%+SUFFIX+32] + paddd m10, m11 ; cost[3a-d] + + phaddd m9, m8 ; cost[0,4,2,6] + phaddd m6, m10 + phaddd m5, m4 + phaddd m6, m5 ; cost[1,3,5,7] + pshufd m4, m9, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m9, m6 + pshufd m0, m9, q1032 + pmaxsd m0, m9 + pshufd m1, m0, q2301 + pmaxsd m0, m1 ; best cost + %else + pcmpgtd m0, m9, m6 + pand m9, m0 + pandn m0, m6 + por m9, m0 + pshufd m1, m9, q1032 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m9, m0 + pshufd m1, m9, q2301 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m0, m9 + %endif + + ; get direction and variance + punpckhdq m1, m4, m6 + punpckldq m4, m6 + psubd m2, m0, m1 + psubd m3, m0, m4 + mova [rsp+0x00], m2 ; emulate ymm in stack + mova [rsp+0x10], m3 + pcmpeqd m1, m0 ; compute best cost mask + pcmpeqd m4, m0 + packssdw m4, m1 + pmovmskb eax, m4 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get direction by converting byte-idx to word-idx + shr r1d, 10 + mov [varq], r1d + %else +cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3 +%define base r2-shufw_6543210x + LEA r2, shufw_6543210x + pxor m0, m0 + lea stride3q, [strideq*3] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + mova m1, [base+pw_128] + psadbw m2, m5, m0 + psadbw m3, m7, m0 + packssdw 
m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + mova [esp+0x00], m4 + mova [esp+0x10], m5 + mova [esp+0x20], m6 + mova [esp+0x50], m7 + + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + psadbw m3, m5, m0 + psadbw m0, m7, m0 + packssdw m3, m0 + pxor m0, m0 + packssdw m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + psllw m1, 3 + psubw m2, m1 ; partial_sum_hv[0] + pmaddwd m2, m2 + + mova m3, [esp+0x50] + mova m0, [esp+0x00] + paddw m0, [esp+0x10] + paddw m1, m3, [esp+0x20] + paddw m0, m4 + paddw m1, m5 + paddw m0, m6 + paddw m1, m7 + paddw m0, m1 ; partial_sum_hv[1] + pmaddwd m0, m0 + + phaddd m2, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + mova [esp+0x30], m2 + + mova m1, [esp+0x10] + pslldq m0, m1, 2 + psrldq m1, 14 + paddw m0, [esp+0x00] + pslldq m2, m3, 6 + psrldq m3, 10 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 4 + psrldq m3, 12 + paddw m0, m2 ; partial_sum_diag[0] top/left half + paddw m1, m3 ; partial_sum_diag[0] top/right half + pslldq m2, m4, 8 + psrldq m3, m4, 8 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 10 + psrldq m3, m5, 6 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 12 + psrldq m3, m6, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m7, 14 + psrldq m3, m7, 2 + paddw m0, m2 ; partial_sum_diag[0][0-7] + paddw m1, m3 ; partial_sum_diag[0][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[0a-d] + mova [esp+0x40], m0 + + mova m1, [esp+0x00] + pslldq m0, m1, 14 + psrldq m1, 2 + paddw m0, m7 + pslldq m2, m3, 8 + psrldq m3, 8 + paddw 
m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 10 + psrldq m3, 6 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x10] + pslldq m2, m3, 12 + psrldq m3, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m4, 6 + psrldq m3, m4, 10 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 4 + psrldq m3, m5, 12 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 2 + psrldq m3, m6, 14 + paddw m0, m2 ; partial_sum_diag[1][0-7] + paddw m1, m3 ; partial_sum_diag[1][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[4a-d] + phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] + phaddd m1, [esp+0x30] ; cost[0,4,2,6] + mova [esp+0x30], m1 + + phaddw m0, [esp+0x00], m4 + phaddw m1, [esp+0x10], m5 + paddw m4, m5 + mova m2, [esp+0x20] + paddw m5, m2, m3 + phaddw m2, m6 + paddw m6, m7 + phaddw m3, m7 + mova m7, [esp+0x00] + paddw m7, [esp+0x10] + mova [esp+0x00], m0 + mova [esp+0x10], m1 + mova [esp+0x20], m2 + + pslldq m1, m4, 4 + pslldq m2, m6, 6 + pslldq m0, m5, 2 + paddw m1, m2 + paddw m0, m7 + psrldq m2, m5, 14 + paddw m0, m1 ; partial_sum_alt[3] left + psrldq m1, m4, 12 + paddw m1, m2 + psrldq m2, m6, 10 + paddw m1, m2 ; partial_sum_alt[3] right + pshuflw m1, m1, q3012 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m2 ; cost[7a-d] + mova [esp+0x40], m0 + + pslldq m0, m7, 6 + psrldq m7, 10 + pslldq m1, m5, 4 + psrldq m5, 12 + pslldq m2, m4, 2 + psrldq m4, 14 + paddw m0, m6 + paddw m7, m5 + paddw m0, m1 + paddw m7, m4 + paddw m0, m2 + pshuflw m2, m7, q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[5a-d] + mova [esp+0x50], m0 + + mova m7, [esp+0x10] + mova 
m2, [esp+0x20] + pslldq m0, m7, 2 + psrldq m7, 14 + pslldq m4, m2, 4 + psrldq m2, 12 + pslldq m5, m3, 6 + psrldq m6, m3, 10 + paddw m0, [esp+0x00] + paddw m7, m2 + paddw m4, m5 + paddw m7, m6 ; partial_sum_alt[3] right + paddw m0, m4 ; partial_sum_alt[3] left + pshuflw m2, m7, q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[1a-d] + SWAP m0, m4 + + pshufd m0, [esp+0x00], q1032 + pshufd m1, [esp+0x10], q1032 + pshufd m2, [esp+0x20], q1032 + pshufd m3, m3, q1032 + mova [esp+0x00], m4 + + pslldq m4, m0, 6 + psrldq m0, 10 + pslldq m5, m1, 4 + psrldq m1, 12 + pslldq m6, m2, 2 + psrldq m2, 14 + paddw m4, m3 + paddw m0, m1 + paddw m5, m6 + paddw m0, m2 + paddw m4, m5 + pshuflw m2, m0, q3012 + punpckhwd m0, m4 + punpcklwd m4, m2 + pmaddwd m0, m0 + pmaddwd m4, m4 + MULLD m0, [base+div_table%+SUFFIX+48] + MULLD m4, [base+div_table%+SUFFIX+32] + paddd m4, m0 ; cost[3a-d] + + mova m1, [esp+0x00] + mova m2, [esp+0x50] + mova m0, [esp+0x30] ; cost[0,4,2,6] + phaddd m1, m4 + phaddd m2, [esp+0x40] ; cost[1,3,5,7] + phaddd m1, m2 + pshufd m2, m0, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m0, m1 + pshufd m3, m0, q1032 + pmaxsd m3, m0 + pshufd m0, m3, q2301 + pmaxsd m0, m3 + %else + pcmpgtd m3, m0, m1 + pand m0, m3 + pandn m3, m1 + por m0, m3 + pshufd m4, m0, q1032 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + pshufd m4, m0, q2301 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + %endif + + ; get direction and variance + mov vard, varm + punpckhdq m3, m2, m1 + punpckldq m2, m1 + psubd m1, m0, m3 + psubd m4, m0, m2 + mova [esp+0x00], m1 ; emulate ymm in stack + mova [esp+0x10], m4 + pcmpeqd m3, m0 ; compute best cost mask + pcmpeqd m2, m0 + packssdw m2, m3 + pmovmskb eax, m2 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get 
direction by converting byte-idx to word-idx + shr r1d, 10 + mov [vard], r1d + %endif + + RET +%endmacro + +INIT_XMM sse4 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM ssse3 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM sse2 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c new file mode 100644 index 0000000000..eb2b4bba6c --- /dev/null +++ b/third_party/dav1d/src/x86/cpu.c @@ -0,0 +1,82 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <stdint.h> + +#include "common/attributes.h" + +#include "src/x86/cpu.h" +/* Register values (EAX, EBX, ECX, EDX) produced by one CPUID invocation. */ +typedef struct { + uint32_t eax, ebx, ecx, edx; +} CpuidRegisters; +/* Low-level probes defined outside this file. NOTE(review): presumably the cpu_cpuid / cpu_xgetbv routines in src/x86/cpuid.asm - confirm the symbol prefix. */ +void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf); +uint64_t dav1d_cpu_xgetbv(unsigned xcr); +/* True only when every bit of mask is set in reg. */ +#define X(reg, mask) (((reg) & (mask)) == (mask)) +/* Queries CPUID (and, on x86-64, XCR0 through dav1d_cpu_xgetbv) and returns the OR of the DAV1D_X86_CPU_FLAG_* bits that apply. SSSE3 is only reported together with SSE2, SSE4.1 only with SSSE3, and AVX512ICL only with AVX2; the result is 0 when CPUID leaf 1 is not available. */ +COLD unsigned dav1d_get_cpu_flags_x86(void) { + CpuidRegisters r = { 0 }; + dav1d_cpu_cpuid(&r, 0, 0); + const unsigned max_leaf = r.eax; /* leaf 0 returns the highest supported standard leaf in EAX */ + unsigned flags = 0; + + if (max_leaf >= 1) { + dav1d_cpu_cpuid(&r, 1, 0); + if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ { + flags |= DAV1D_X86_CPU_FLAG_SSE2; + if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ { + flags |= DAV1D_X86_CPU_FLAG_SSSE3; + if (X(r.ecx, 0x00080000)) /* SSE4.1 */ + flags |= DAV1D_X86_CPU_FLAG_SSE41; + } + } +#if ARCH_X86_64 + /* We only support >128-bit SIMD on x86-64. */ + if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ { + const uint64_t xcr0 = dav1d_cpu_xgetbv(0); + if (X(xcr0, 0x00000006)) /* XMM/YMM */ { + if (max_leaf >= 7) { + dav1d_cpu_cpuid(&r, 7, 0); + if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ { + flags |= DAV1D_X86_CPU_FLAG_AVX2; + if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ { + if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42)) + flags |= DAV1D_X86_CPU_FLAG_AVX512ICL; + } + } + } + } + } +#endif + } + + return flags; +} diff --git a/third_party/dav1d/src/x86/cpu.h b/third_party/dav1d/src/x86/cpu.h new file mode 100644 index 0000000000..26ebc3893a --- /dev/null +++ b/third_party/dav1d/src/x86/cpu.h @@ -0,0 +1,42 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_X86_CPU_H +#define DAV1D_SRC_X86_CPU_H +/* x86 SIMD capability flags; every enumerator is a distinct single bit, so any combination fits in one unsigned value. */ +enum CpuFlags { + DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0, + DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1, + DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2, + DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3, + DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/ + * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */ +}; +/* Returns the OR of the enum CpuFlags bits detected for the CPU it runs on. */ +unsigned dav1d_get_cpu_flags_x86(void); + +#endif /* DAV1D_SRC_X86_CPU_H */ diff --git a/third_party/dav1d/src/x86/cpuid.asm b/third_party/dav1d/src/x86/cpuid.asm new file mode 100644 index 0000000000..d4ee6825a4 --- /dev/null +++ b/third_party/dav1d/src/x86/cpuid.asm @@ -0,0 +1,55 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION .text +; cpu_cpuid(regs, leaf, subleaf): executes CPUID with eax=leaf and ecx=subleaf, then stores the resulting eax, ebx, ecx, edx as four consecutive dwords at regs. +cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf + mov r4, regsmp + mov eax, leafm + mov ecx, subleafm +%if ARCH_X86_64 + mov r5, rbx ; cpuid overwrites ebx, so rbx is parked in r5 and restored below +%endif + cpuid + mov [r4+4*0], eax + mov [r4+4*1], ebx + mov [r4+4*2], ecx + mov [r4+4*3], edx +%if ARCH_X86_64 + mov rbx, r5 +%endif + RET +; cpu_xgetbv(xcr): executes XGETBV with ecx=xcr; on x86-64 the edx:eax halves are merged into rax. NOTE(review): on x86-32 the value is presumably returned unchanged in edx:eax - confirm against the calling convention. +cglobal cpu_xgetbv, 0, 0, 0, xcr + movifnidn ecx, xcrm + xgetbv +%if ARCH_X86_64 + shl rdx, 32 + or rax, rdx +%endif + RET diff --git a/third_party/dav1d/src/x86/film_grain.asm b/third_party/dav1d/src/x86/film_grain.asm new file mode 100644 index 0000000000..72a1e3c009 --- /dev/null +++ b/third_party/dav1d/src/x86/film_grain.asm @@ -0,0 +1,2405 @@ +; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_8x_27_17_8x_17_27: times 8 db 27, 17 + times 8 db 17, 27 +pw_1024: times 16 dw 1024 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pd_m65536: dd ~0xffff +pb_23_22: times 2 db 23, 22 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pb_27_17_17_27: db 27, 17, 17, 27 +pw_1: dw 1 +; JMP_TABLE fn, n...: defines fn_table and emits one dword per n, each holding the offset of the mangled fn label .ar<n> relative to the start of the table. +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +ALIGN 4 +JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 +; Byte offsets of the fields that the code below reads through fg_dataq. NOTE(review): presumably this layout must mirror the C-side film grain parameter struct - confirm. +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text +; generate_grain_y(buf, fg_data): fills buf 4 bytes per iteration while the offset runs from -73*82 up to 0, using gaussian_sequence entries gathered by indices derived from a 16-bit seed initialised from FGData.seed, then jumps through generate_grain_y_avx2_table on FGData.ar_coeff_lag to one of .ar0-.ar3. +INIT_XMM avx2 +cglobal generate_grain_y, 2, 9, 16, buf, fg_data + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + vpbroadcastw xm8, [base+round+r2*2] + mova xm5, [base+pb_mask] + 
vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastd xm9, [base+pd_m65536] + mov r2, -73*82 + sub bufq, r2 + lea r3, [gaussian_sequence] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r3+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + pmulhrsw xm2, xm8 + packsswb xm2, xm2 + movd [bufq+r2], xm2 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_avx2_table+r2*4] + lea r2, [r2+base+generate_grain_y_avx2_table] + jmp r2 + +.ar1: + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 + pinsrb xm4, [pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left + pmovsxbw xm2, [bufq+xq-82+0] ; top + pmovsxbw xm1, [bufq+xq-82+1] ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d +%if WIN64 + sarx val3d, val3d, shiftd +%else + sar val3d, shiftb +%endif + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind 
+ cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend+1] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pmovsxbw xm9, xm9 + DEFINE_ARGS buf, fg_data, h, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 + pmovzxwd xm14, xm14 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm6, xm1 + psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm7, xm1 + pmaddwd xm4, xm9 + pmaddwd xm6, xm10 + pmaddwd xm7, xm12 + paddd xm4, xm6 + paddd xm2, xm7 + paddd xm2, xm4 + paddd xm2, xm14 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm1, xm0 + pmaddwd xm3, xm1, xm13 + paddd xm3, xm2 + psrldq xm1, 4 ; y=0,x=0 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw xm3, xm1 + packsswb xm3, xm3 + pextrb [bufq+xq], xm3, 0 + pslldq xm3, 2 + pand xm3, xm15 + pandn xm0, xm15, xm0 + por xm0, xm3 + psrldq xm0, 1 
+ inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if WIN64 + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) +%else + ALLOC_STACK 16*12 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + psrldq xm13, xm2, 10 + pinsrw xm2, [pw_1], 5 + pshufd xm4, xm2, q2222 + pshufd xm2, xm2, q0000 + pinsrw xm13, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + 
punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 + + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm14 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmovsxbw xm2, xm1 + pmaddwd xm2, xm13 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb xm2, xm2 + pextrb [bufq+xq], xm2, 0 + pslldq xm2, 3 + pand xm2, xm15 + pandn xm1, xm15, xm1 + por xm1, xm2 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET +; generate_grain_uv_fn ss_name, ss_x, ss_y: emits generate_grain_uv_<ss_name>(buf, bufy, fg_data, uv). With ss_x set, the fill loop writes 44 bytes per row for 73-35*ss_y rows at an 82-byte row stride; otherwise it runs over 73*82 bytes as in generate_grain_y. The .ar0-.ar3 paths are then selected by FGData.ar_coeff_lag. NOTE(review): ss_x / ss_y are presumably the horizontal / vertical chroma subsampling flags - confirm. +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + vpbroadcastw xm8, [base+round+r5*2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastw xm9, 
[base+pw_seed_xor+uvq*4] + pxor xm0, xm9 + vpbroadcastd xm9, [base+pd_m65536] + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -73*82 + sub bufq, r5 +.loop: +%endif + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + pmulhrsw xm2, xm8 + packsswb xm2, xm2 + movd [bufq+r5], xm2 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 + dec r7d + jg .loop_y +%else + jl .loop +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_avx2_table] + jmp r5 + +.ar0: + INIT_YMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd xm3, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h + pmovsxbw xm4, xm4 +%if %2 + vpbroadcastd m7, [pb_1] + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif + vpbroadcastw m4, xm4 + vpbroadcastw m3, xm3 + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm8, [bufyq] +%if %3 + movu xm9, [bufyq+82] +%endif + movu xm10, [bufyq+16] +%if %3 + movu xm11, [bufyq+82+16] +%endif + vinserti128 m8, [bufyq+32], 1 +%if %3 + vinserti128 m9, [bufyq+82+32], 1 +%endif + vinserti128 m10, [bufyq+48], 1 +%if %3 + vinserti128 m11, [bufyq+82+48], 1 +%endif + pmaddubsw m8, 
m7, m8 +%if %3 + pmaddubsw m9, m7, m9 +%endif + pmaddubsw m10, m7, m10 +%if %3 + pmaddubsw m11, m7, m11 + paddw m8, m9 + paddw m10, m11 +%endif + pmulhrsw m8, m6 + pmulhrsw m10, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m8, [bufyq+r3] + pcmpgtb m9, m12, m8 + punpckhbw m10, m8, m9 + punpcklbw m8, m9 +%endif + pmullw m8, m4 + pmullw m10, m4 + pmulhrsw m8, m3 + pmulhrsw m10, m3 +%if %2 + movu m0, [bufq] +%else + movu m0, [bufq+r3] +%endif + pcmpgtb m1, m12, m0 + punpckhbw m9, m0, m1 + punpcklbw m0, m1 + paddw m0, m8 + paddw m9, m10 + packsswb m0, m9 +%if %2 + movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif + + ; last 6/12 pixels + movu xm8, [bufyq+32*2] +%if %2 +%if %3 + movu xm9, [bufyq+32*2+82] +%endif + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 + pmullw xm8, xm4 + pmulhrsw xm8, xm3 + movq xm0, [bufq+32] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm9, xm0, xm9 + paddw xm8, xm9 + packsswb xm8, xm8 + vpblendw xm0, xm8, xm0, 1000b + movq [bufq+32], xm0 +%else + pcmpgtb xm9, xm12, xm8 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + pmullw xm10, xm4 + pmullw xm8, xm4 + pmulhrsw xm10, xm3 + pmulhrsw xm8, xm3 + movu xm0, [bufq+64] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm1, xm0, xm9 + punpckhbw xm9, xm0, xm9 + paddw xm1, xm8 + paddw xm9, xm10 + packsswb xm1, xm9 + vpblendw xm0, xm1, xm0, 11000000b + movu [bufq+64], xm0 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: + INIT_XMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd 
xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + vpbroadcastd xm7, [pb_1] + vpbroadcastw xm6, [hmul_bits+2+%3*2] +%endif + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 + movq xm8, [bufyq+xq*2] +%if %3 + movq xm9, [bufyq+xq*2+82] +%endif +%endif + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm15, [base+round_vals-12+shiftq*2] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 + pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 + pinsrw xm9, [base+pw_1], 5 +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] + vpbroadcastd xm6, [base+pb_1] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm14, xm9, q2222 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 +%if %2 + 
sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm0, xm1 + psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm3, xm1 + pmaddwd xm4, xm9 + pmaddwd xm0, xm10 + pmaddwd xm3, xm12 + paddd xm4, xm0 + paddd xm2, xm3 + paddd xm2, xm4 + +%if %2 + movq xm0, [bufyq+xq*2] +%if %3 + movq xm3, [bufyq+xq*2+82] +%endif + pmaddubsw xm0, xm6, xm0 +%if %3 + pmaddubsw xm3, xm6, xm3 + paddw xm0, xm3 +%endif + pmulhrsw xm0, xm7 +%else + pmovsxbw xm0, [bufyq+xq] +%endif + punpcklwd xm0, xm15 + pmaddwd xm0, xm14 + paddd xm2, xm0 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm0, xm0 + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + pslldq xm3, 2 + psrldq xm0, 2 + paddw xm3, xm0 + vpblendw xm0, xm3, 00000010b + packsswb xm0, xm0 + pextrb [bufq+xq], xm0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 + pmovsxbw 
xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 + pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + pshufd xm4, xm2, q2222 + vpbroadcastw xm5, xm5 + vpblendw xm4, xm5, 10101010b ; interleave luma cf + psrldq xm5, xm2, 10 + pshufd xm2, xm2, q0000 + pinsrw xm5, [base+round_vals+shiftq*2-10], 3 + pmovzxwd xm14, xm14 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + mova [rsp+11*16], xm5 +%if %2 + vpbroadcastd xm13, [base+pb_1] + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 
+ + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + +%if %2 + movq xm1, [bufyq+xq*2] +%if %3 + movq xm2, [bufyq+xq*2+82] +%endif + pmaddubsw xm1, xm13, xm1 +%if %3 + pmaddubsw xm2, xm13, xm2 + paddw xm1, xm2 +%endif + pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, [bufyq+xq] +%endif + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm1 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + paddd xm0, xm14 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmovsxbw xm1, xm1 + pmaddwd xm2, xm1, [rsp+16*11] + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + pslldq xm2, 6 + vpblendw xm1, xm2, 1000b + packsswb xm1, xm1 + pextrb [bufq+xq], xm1, 3 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +INIT_YMM avx2 +cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut + pcmpeqw m10, m10 + psrld m10, 24 + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, 
[fg_dataq+FGData.clip_to_restricted_range] + vpbroadcastw m12, [base+max+r7*4] + vpbroadcastw m13, [base+min+r7*2] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, 
noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + + ; r8m = sbym + movd xm15, [pb_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) + movd xm14, [pw_1024] +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 + packsswb xm4, xm4 + vpblendw xm4, xm3, 11111110b + vpblendd m3, m4, 00001111b + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + 
pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + vpbroadcastd m14, [pw_1024] +.loop_x_v_overlap: + vpbroadcastw m15, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + 
; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] + punpckhbw m6, m4, m3 + punpcklbw m4, m3 + pmaddubsw m6, m15, m6 + pmaddubsw m4, m15, m4 + pmulhrsw m6, m14 + pmulhrsw m4, m14 + packsswb m3, m4, m6 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 32 + jge .end_hv + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + + movd xm15, [pb_27_17_17_27] +.loop_x_hv_overlap: + vpbroadcastw m8, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; 
parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+32] + lea left_offxyq, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel + vpgatherdd m9, [scalingq+m4], m3 + pcmpeqw m3, m3 + vpgatherdd m4, [scalingq+m5], m3 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq+m6], m3 + pcmpeqw m3, m3 + vpgatherdd m6, [scalingq+m7], m3 + pand m9, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m9, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + pmaddubsw xm4, xm15, xm4 + pmaddubsw xm7, xm15, xm7 + pmulhrsw xm4, xm14 + pmulhrsw xm7, xm14 + packsswb xm4, xm4 + packsswb xm7, xm7 + vpblendw xm4, xm3, 11111110b + vpblendw xm7, xm6, 11111110b + vpblendd m3, m4, 00001111b + vpblendd m6, m7, 00001111b + ; followed by v interpolation (top | cur -> cur) + punpckhbw 
m7, m6, m3 + punpcklbw m6, m3 + pmaddubsw m7, m8, m7 + pmaddubsw m6, m8, m6 + pmulhrsw m7, m14 + pmulhrsw m6, m14 + packsswb m3, m6, m7 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m9 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq] + jl .loop_x_hv_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id + pcmpeqw m10, m10 + psrld m10, 24 + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, dword is_idm + vpbroadcastw m13, [base+min+r7*2] + shlx r7d, r7d, r9d + vpbroadcastw m12, [base+max+r7*2] + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, dword r11m + vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] + vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklbw m14, m1, m0 + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] +%else + vpbroadcastd m14, [pw_1024] +%if %2 + vpbroadcastd m15, [pb_23_22] +%else + vpbroadcastd xm15, [pb_27_17_17_27] +%endif 
+%endif + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, unused5, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + pxor m2, m2 + mova m4, [lumaq] + mova m0, [srcq] +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd 
m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm3, [grain_lutq+offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 + jg %%loop_y + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 
12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 +%if %1 + vpbroadcastd m6, [pb_23_22] ; FIXME +%endif + movu xm3, [grain_lutq+offxyq+ 0] + movd xm4, [grain_lutq+left_offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 + punpcklbw m4, m3 +%if %1 + pmaddubsw m4, m6, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 
+%endif + packsswb m4, m4 + pcmpeqw m6, m6 ; FIXME + psrldq m6, 15 ; FIXME + vpblendvb m3, m3, m4, m6 +%else +%if %1 + vpbroadcastd xm6, [pb_27_17_17_27] +%endif + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 +%if %1 + pmaddubsw xm4, xm6, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm6, xm6 + psrldq xm6, 14 + vpblendvb m3, m3, m4, m6 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, overlap, unused1, unused2, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused1, unused2, see, overlap, unused3, unused4, lstride + + mov lumaq, r9mp + lea r12, 
[srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif +%%loop_y_v_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; 
m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + mova m6, [pb_8x_27_17_8x_17_27] + movu xm3, [grain_lutq+offxyq] + movu xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m9, m4, m3 + punpcklbw m4, m3 +%if %2 + pmaddubsw m9, m6, m9 + pmaddubsw m4, m6, m4 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m4, m1, m4 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m4, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m4, m14 +%endif + packsswb m3, m4, m9 +%else +%if %1 + vpbroadcastd m6, [pb_23_22] +%endif + movq xm3, [grain_lutq+offxyq] + movq xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+8], 1 + vinserti128 m4, [grain_lutq+top_offxyq+8], 1 + punpcklbw m4, m3 +%if %1 + pmaddubsw m4, m6, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + vpermq m4, m4, q3120 + ; only interpolate first line, insert second line unmodified + vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 
+%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + + sub hb, 1+%2 + jl %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif 
+%%loop_y_hv_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m8, [scalingq+m4], m9 + vpgatherdd m4, [scalingq+m5], m3 + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq+m6], m9 + vpgatherdd m6, [scalingq+m7], m3 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %1 +%if %2 + vpbroadcastd m9, [pb_23_22] +%else + vpbroadcastd xm9, [pb_27_17_17_27] +%endif +%endif + +%if %2 + movu xm3, [grain_lutq+offxyq] +%if %3 + movq xm6, [grain_lutq+top_offxyq] +%else + movu xm6, [grain_lutq+top_offxyq] +%endif + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%if %3 + vinserti128 m6, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+top_offxyq+82], 1 +%endif +%else + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] +%endif + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] +%if %2 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 +%if %3 == 0 + 
vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 +%endif +%endif + + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) +%if %2 + punpcklbw m4, m3 +%if %3 + punpcklbw xm7, xm6 +%else + punpcklbw m7, m6 +%endif + punpcklwd m4, m7 +%if %1 + pmaddubsw m4, m9, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + pcmpeqw m9, m9 ; this is kind of ugly + psrldq m9, 15 + vpblendvb m3, m3, m4, m9 + psrldq m4, 1 +%if %3 + shufpd m9, m9, m9, 1110b ; clear upper lane +%endif + vpblendvb m6, m6, m4, m9 +%else + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + punpckldq xm4, xm7 +%if %1 + pmaddubsw xm4, xm9, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm9, xm9 ; this is kind of ugly + psrldq xm9, 14 + vpblendvb m3, m3, m4, m9 + psrldq xm4, 2 + vpblendvb m6, m6, m4, m9 +%endif + + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m9, m3, q3120 + punpcklbw m6, m9 +%if %1 + vpbroadcastd m9, [pb_23_22] + pmaddubsw m6, m9, m6 + pmulhrsw m6, [pw_1024] +%else + pmaddubsw m6, m15, m6 + pmulhrsw m6, m14 +%endif + packsswb m6, m6 + vpermq m6, m6, q3120 + vpblendd m3, m3, m6, 00001111b +%else + punpckhbw m9, m6, m3 + punpcklbw m6, m3 +%if %2 + mova m3, [pb_8x_27_17_8x_17_27] + pmaddubsw m9, m3, m9 + pmaddubsw m6, m3, m6 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m6, m1, m6 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m6, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m6, m14 +%endif + packsswb m3, m6, m9 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + 
pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 +%if %2 + jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/film_grain_init_tmpl.c b/third_party/dav1d/src/x86/film_grain_init_tmpl.c new file mode 100644 index 0000000000..25e8ef99e6 --- /dev/null +++ b/third_party/dav1d/src/x86/film_grain_init_tmpl.c @@ -0,0 +1,77 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/film_grain.h" + +decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3); +decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3); + +decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2); +decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2); + +COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { /* Fills c with x86 SIMD film-grain entry points chosen from the flags returned by dav1d_get_cpu_flags(). */ + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; /* SSSE3 flag clear: return with c unmodified */ + +#if BITDEPTH == 8 /* SSSE3 routines are assigned only when BITDEPTH is 8 */ + c->generate_grain_y = 
dav1d_generate_grain_y_ssse3; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3; /* uv tables are indexed by layout constant minus one */ + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3; + c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; /* AVX2 flag clear: keep whatever was assigned above */ + +#if BITDEPTH == 8 && ARCH_X86_64 /* AVX2 routines overwrite the SSSE3 ones, only for 8-bit builds with ARCH_X86_64 set */ + c->generate_grain_y = dav1d_generate_grain_y_avx2; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2; + c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2; +#endif +} diff --git a/third_party/dav1d/src/x86/film_grain_ssse3.asm b/third_party/dav1d/src/x86/film_grain_ssse3.asm new file mode 100644 index 0000000000..9e47ea0659 --- /dev/null +++ b/third_party/dav1d/src/x86/film_grain_ssse3.asm @@ -0,0 +1,3301 @@ +; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +pw_1024: times 8 dw 1024 +pb_27_17: times 8 db 27, 17 +pb_17_27: times 8 db 17, 27 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_23_22: times 2 db 23, 22 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pw_1: dw 1 + +%define pb_27_17_17_27 pb_17_27 - 2 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 
+JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + movd m2, [base+round+r2*2] + movd m0, [fg_dataq+FGData.seed] + mova m5, [base+pb_mask] + pshuflw m2, m2, q0000 + pshuflw m0, m0, q0000 + mov r2, -73*82 + sub bufq, r2 + lea r3, [base+gaussian_sequence] +.loop: + pand m6, m0, m1 + psrlw m3, m6, 10 + por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m6, m4 ; bits 0x0f00 are set + pshufb m3, m5, m6 ; set 15th bit for next 4 seeds + psllq m6, m3, 30 + por m3, m6 + psllq m6, m3, 15 + por m3, m6 ; aggregate each bit into next seed's high bit + pmulhuw m6, m0, m7 + por m3, m6 ; 4 next output seeds + pshuflw m0, m3, q3333 + psrlw m3, 5 +%if ARCH_X86_64 + movq r6, m3 + mov r8, r6 + movzx r5d, r6w + shr r6d, 16 + shr r8, 32 + movzx r7, r8w + shr r8, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + pinsrw m6, [r3+r7*2], 2 + pinsrw m6, [r3+r8*2], 3 +%else + movd r6, m3 + pshuflw m3, m3, q3232 + movzx r5, r6w + shr r6, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + + movd r6, m3 + movzx r5, r6w + 
shr r6, 16 + + pinsrw m6, [r3+r5*2], 2 + pinsrw m6, [r3+r6*2], 3 +%endif + pmulhrsw m6, m2 + packsswb m6, m6 + movd [bufq+r2], m6 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] + lea r2, [r2+base+generate_grain_y_ssse3_table] + jmp r2 + +.ar1: +%if ARCH_X86_32 + DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max +%elif WIN64 + DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 + mov bufq, r0 +%else + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov ecx, [fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_32 + mov r1m, cf3d + DEFINE_ARGS buf, shift, val3, min, max, x, val0 +%define hd r0mp +%define cf3d r1mp +%elif WIN64 + DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 +%else + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 +%endif + pxor m6, m6 + pcmpgtb m7, m6, m4 + punpcklbw m4, m7 + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pshuflw m3, m3, q0000 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + movq m0, [bufq+xq-82-1] ; top/left + pcmpgtb m7, m6, m0 + punpcklbw m0, m7 + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec 
hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend+1] + SCRATCH 7, 15, 7 + movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pxor m7, m7 + pshuflw m6, m6, q0000 + punpcklwd m6, m7 + pcmpgtb m4, m7, m0 + pcmpgtb m5, m7, m1 + punpcklbw m0, m4 + punpcklbw m1, m5 + DEFINE_ARGS buf, fg_data, h, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m7, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m6, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m6, m1 + psrldq m5, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m5, m1 + pmaddwd m4, m9 + pmaddwd m6, m10 + pmaddwd m5, m12 + paddd m4, m6 + paddd m2, m5 + paddd m2, m4 + paddd m2, m14 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pcmpgtb m4, m7, m0 + punpcklbw m1, m0, m4 + pmaddwd m3, m1, m13 + paddd m3, m2 + psrldq m1, 4 ; y=0,x=0 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw m3, m1 + packsswb m3, m3 + pslldq m3, 2 + pand m3, 
m15 + pandn m1, m15, m0 + por m0, m1, m3 + psrldq m0, 1 + ; overwrite 2 pixels, but that's ok + movd [bufq+xq-1], m0 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if ARCH_X86_32 +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 +%elif WIN64 + SUB rsp, 16*6 +%assign stack_size_padded (stack_size_padded+16*6) +%assign stack_size (stack_size+16*6) +%else + ALLOC_STACK -16*6 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend] + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pxor m3, m3 + pcmpgtb m4, m3, m0 + pcmpgtb m3, m2 + pshuflw m6, m6, q0000 + SCRATCH 6, 14, 12 + SCRATCH 7, 15, 13 + punpckhbw m1, m0, m4 + punpcklbw m0, m4 + punpcklbw m2, m3 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m5, m0, q3333 + pshufd m0, m0, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m3 + mova [rsp+ 2*16], m4 + mova [rsp+ 3*16], m5 + pshufd m6, m1, q1111 + pshufd m7, m1, q2222 + pshufd m5, m1, q3333 + pshufd m1, m1, q0000 + pshufd m3, m2, q1111 + psrldq m0, m2, 10 + pinsrw m2, [base+pw_1], 5 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + pinsrw m0, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m6 + SCRATCH 7, 8, 6 + SCRATCH 5, 9, 7 + SCRATCH 2, 10, 8 + SCRATCH 3, 11, 9 + SCRATCH 4, 12, 10 + SCRATCH 0, 13, 11 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m2, m0, m3 + punpcklbw m0, m3 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + 
pxor m5, m5 + pcmpgtb m5, m1 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + palignr m6, m2, m0, 10 + palignr m7, m2, m0, 12 + psrldq m0, 8 + punpcklwd m0, m6 + punpcklwd m7, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m7, [rsp+ 3*16] + paddd m0, m7 + paddd m0, m4 + + psrldq m4, m1, 2 + psrldq m5, m1, 4 + psrldq m6, m1, 6 + psrldq m7, m1, 8 + punpcklwd m4, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 4*16] + pmaddwd m6, [rsp+ 5*16] + paddd m4, m6 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m5, m2, m7 + punpcklbw m2, m7 + palignr m7, m3, m1, 10 + palignr m3, m1, 12 + psrldq m1, m2, 2 + punpcklwd m7, m3 + punpcklwd m3, m2, m1 + pmaddwd m7, m8 + pmaddwd m3, m9 + paddd m7, m3 + paddd m0, m7 + + psrldq m6, m2, 4 + psrldq m1, m2, 6 + psrldq m3, m2, 8 + palignr m4, m5, m2, 10 + palignr m5, m5, m2, 12 + + punpcklwd m6, m1 + punpcklwd m3, m4 + punpcklwd m5, m14 + pmaddwd m6, m10 + pmaddwd m3, m11 + pmaddwd m5, m12 + paddd m0, m6 + paddd m3, m5 + paddd m0, m3 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pxor m5, m5 + pcmpgtb m5, m1 + punpcklbw m2, m1, m5 + pmaddwd m2, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb m2, m2 + pslldq m2, 3 + pand m2, m15 + pandn m3, m15, m1 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv + movifnidn r2, r2mp + movifnidn r3, r3mp + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movd m6, 
[base+round+r5*2] + mova m5, [base+pb_mask] + movd m0, [fg_dataq+FGData.seed] + movd m2, [base+pw_seed_xor+uvq*4] + pxor m0, m2 + pshuflw m6, m6, q0000 + pshuflw m0, m0, q0000 + lea r6, [base+gaussian_sequence] +%if %2 +%if ARCH_X86_64 + mov r7d, 73-35*%3 +%else + mov r3mp, 73-35*%3 +%endif + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m3, m2 + psllq m2, m3, 15 + por m3, m2 ; aggregate each bit into next seed's high bit + pmulhuw m2, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + movd r9d, m2 + pshuflw m2, m2, q3232 + movzx r8, r9w + shr r9, 16 + + movd m3, [r6+r8*2] + pinsrw m3, [r6+r9*2], 1 + + movd r9d, m2 + movzx r8, r9w + shr r9, 16 + + pinsrw m3, [r6+r8*2], 2 + pinsrw m3, [r6+r9*2], 3 +%else + movd r2, m2 + pshuflw m2, m2, q3232 + movzx r1, r2w + shr r2, 16 + + movd m3, [r6+r1*2] + pinsrw m3, [r6+r2*2], 1 + + movd r2, m2 + movzx r1, r2w + shr r2, 16 + + pinsrw m3, [r6+r1*2], 2 + pinsrw m3, [r6+r2*2], 3 +%endif + pmulhrsw m3, m6 + packsswb m3, m3 + movd [bufq+r5], m3 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 +%if ARCH_X86_64 + dec r7d +%else + dec r3mp +%endif + jg .loop_y +%else + jl .loop +%endif + +%if ARCH_X86_32 + mov r2, r2mp +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -2*16 +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd m4, [base+hmul_bits+shiftq*2] + 
DEFINE_ARGS buf, bufy, h, x + pxor m0, m0 + pcmpgtb m0, m5 + punpcklbw m5, m0 + movd m7, [base+pb_1] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif + pshuflw m5, m5, q0000 + pshuflw m4, m4, q0000 + pshufd m7, m7, q0000 +%if %2 + pshuflw m6, m6, q0000 +%endif + punpcklqdq m5, m5 + punpcklqdq m4, m4 +%if %2 + punpcklqdq m6, m6 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 + SCRATCH 1, 8, 0 + SCRATCH 4, 9, 1 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: + xor xd, xd +.x_loop_ar0: + ; first 32 pixels +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 +%endif + pmaddubsw m2, m7, m3 +%if %3 + pmaddubsw m3, m7, m4 + paddw m0, m1 + paddw m2, m3 +%endif + pmulhrsw m0, m6 + pmulhrsw m2, m6 +%else + movu m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif + pmullw m0, m5 + pmullw m2, m5 + pmulhrsw m0, m9 + pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif + packsswb m0, m2 +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 + paddw m0, m1 +%endif + pmulhrsw m0, m6 + pmullw m0, m5 + pmulhrsw m0, m9 + movq m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 + packsswb m0, m0 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movu [bufq+xq-16], m2 +%endif + + 
add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] + pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 +%if ARCH_X86_32 + mov r3mp, cf3d + DEFINE_ARGS buf, shift, fg_data, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x + mov bufq, r0 +%else + DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + movd m7, [base+pb_1] + movd m6, [base+hmul_bits+2+%3*2] +%endif + psrldq m4, 1 +%if ARCH_X86_32 + DEFINE_ARGS buf, shift, val0, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 +%else + DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 +%endif + pxor m5, m5 + punpcklwd m3, m5 +%if %2 + punpcklwd m6, m6 +%endif + pcmpgtb m5, m4 + punpcklbw m4, m5 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + pshufd m3, m3, q0000 +%if %2 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif +%if ARCH_X86_32 + add r1mp, 79+82*3 + mov r0mp, 70-35*%3 +%else + add bufyq, 79+82*3 + mov hd, 70-35*%3 +%endif + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: +%if %2 +%if ARCH_X86_32 + mov r2, r1mp + movq m0, [r2+xq*2] +%if %3 + movq m1, [r2+xq*2+82] +%endif +%else + movq m0, [bufyq+xq*2] +%if %3 + movq m1, [bufyq+xq*2+82] +%endif +%endif + pmaddubsw m2, m7, m0 +%if %3 + pmaddubsw m0, m7, m1 + paddw m2, m0 +%endif + pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + 
pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif + + movq m0, [bufq+xq-82-1] ; top/left + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 + psrldq m1, m0, 4 ; top/right + punpcklwd m1, m2 + psrldq m2, m0, 2 ; top + punpcklwd m0, m2 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 +%if ARCH_X86_32 + imul val3d, r3mp +%else + imul val3d, cf3d +%endif + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 +%if ARCH_X86_32 + add r1mp, 82<<%3 + dec r0mp +%else + add bufyq, 82<<%3 + dec hd +%endif + jg .y_loop_ar1 + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp + ALLOC_STACK -8*16 +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + movd m7, [base+round_vals-12+shiftq*2] + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 + pxor m2, m2 + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + pinsrw m1, [base+pw_1], 5 + punpcklwd m7, m7 + pshufd m7, m7, q0000 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + SCRATCH 7, 15, 7 +%if %2 + movd m7, [base+hmul_bits+2+%3*2] + movd m6, [base+pb_1] + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + 
mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pxor m2, m2 + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m0, m1 + psrldq m3, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + pmaddwd m4, m9 + pmaddwd m0, m10 + pmaddwd m3, m12 + paddd m4, m0 + paddd m2, m3 + paddd m2, m4 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 + paddw m0, m1 +%endif + pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 +%endif + punpcklwd m0, m15 + pmaddwd m0, m14 + paddd m2, m0 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] + pxor m4, m4 + movd m5, [base+byte_blend+1] + punpcklbw m5, m5 +.x_loop_ar2_inner: + pcmpgtb m1, m4, m0 + punpcklbw m0, m1 + pmaddwd m3, m0, m13 + paddd m3, m2 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + pslldq m3, 4 + pand m3, m5 + paddw m0, m3 + packsswb m0, m0 + movd [bufq+xq-2], m0 + psrldq m0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 + ALLOC_STACK -15*16 +%else + SUB rsp, 16*7 +%assign stack_size_padded (stack_size_padded+16*7) +%assign stack_size (stack_size+16*7) +%endif + 
mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + pshufd m2, m0, q1111 + pshufd m3, m0, q2222 + pshufd m4, m0, q3333 + pshufd m0, m0, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m7, m1, q3333 + pshufd m1, m1, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m2 + mova [rsp+ 2*16], m3 + mova [rsp+ 3*16], m4 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m5 + mova [rsp+ 6*16], m6 + SCRATCH 7, 8, 7 + + movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] + pxor m4, m4 + pcmpgtb m4, m2 + punpckhbw m5, m2, m4 + punpcklbw m2, m4 + pshufd m4, m2, q3232 + punpcklwd m3, m4, m5 + pshuflw m5, m4, q3321 + pshufd m4, m3, q0000 + pshufd m3, m2, q1111 + pshufd m2, m2, q0000 + pinsrw m5, [base+round_vals+shiftq*2-10], 3 + SCRATCH 2, 9, 8 + SCRATCH 3, 10, 9 + SCRATCH 4, 11, 10 + SCRATCH 5, 12, 11 + + movd m2, [base+round_vals-12+shiftq*2] +%if %2 + movd m1, [base+pb_1] + movd m3, [base+hmul_bits+2+%3*2] +%endif + pxor m0, m0 + punpcklwd m2, m0 +%if %2 + punpcklwd m3, m3 +%endif + pshufd m2, m2, q0000 +%if %2 + pshufd m1, m1, q0000 + pshufd m3, m3, q0000 + SCRATCH 1, 13, 12 +%endif + SCRATCH 2, 14, 13 +%if %2 + SCRATCH 3, 15, 14 +%endif + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m4, m4 + pcmpgtb m4, m0 + punpckhbw m3, m0, m4 + punpcklbw m0, m4 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + palignr m2, m3, m0, 10 + palignr m3, m0, 12 + psrldq m0, 8 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m6, m6 + pcmpgtb m6, m1 + punpckhbw m5, m1, m6 + punpcklbw m1, m6 
+ + punpcklwd m0, m2 + punpcklwd m3, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m3, [rsp+ 3*16] + paddd m0, m3 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m6, m2, m7 + punpcklbw m2, m7 + + palignr m3, m5, m1, 10 + palignr m5, m1, 12 + psrldq m4, m2, 2 + + punpcklwd m3, m5 + punpcklwd m5, m2, m4 + pmaddwd m3, [rsp+ 6*16] + pmaddwd m5, m8 + paddd m3, m5 + paddd m0, m3 + + psrldq m3, m1, 2 + psrldq m4, m1, 4 + psrldq m5, m1, 6 + psrldq m1, 8 + + punpcklwd m3, m4 + punpcklwd m5, m1 + pmaddwd m3, [rsp+ 4*16] + pmaddwd m5, [rsp+ 5*16] + paddd m3, m5 + paddd m0, m3 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 + paddw m7, m5 +%endif + pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif + + psrldq m1, m2, 4 + psrldq m3, m2, 6 + palignr m4, m6, m2, 10 + palignr m6, m2, 12 + psrldq m2, 8 + + punpcklwd m1, m3 + punpcklwd m2, m4 + punpcklwd m6, m7 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m6, m11 + paddd m1, m2 + paddd m0, m6 + paddd m0, m1 + paddd m0, m14 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] + pxor m4, m4 + movd m5, [base+byte_blend] +.x_loop_ar3_inner: + pcmpgtb m2, m4, m1 + punpcklbw m3, m1, m2 + pmaddwd m2, m3, m12 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + packsswb m2, m2 + pandn m3, m5, m1 + pslld m2, 24 + pand m2, m5 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg +%assign %%idx 0 
+%define %%tmp %2 +%if %0 == 6 +%define %%tmp %6 +%endif +%rep 4 +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4] +%else + pinsrw %1, [%3+%4], %%idx + 0 +%endif + pinsrw %1, [%3+%5], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +INIT_XMM ssse3 +; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby): adds scaled film grain to src and stores the clipped sum to dst, 16 pixels per row step +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + + mov [rsp+6*mmsize+ 3*gprsize], r0 + mov [rsp+6*mmsize+ 5*gprsize], r1 + mov [rsp+6*mmsize+ 7*gprsize], r2 + mov [rsp+6*mmsize+ 9*gprsize], r3 + mov [rsp+6*mmsize+10*gprsize], r4 + mov [rsp+6*mmsize+11*gprsize], r5 +%else +cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+6*mmsize+ 3*gprsize] +%define r1m [rsp+6*mmsize+ 4*gprsize] +%define r2m [rsp+6*mmsize+ 5*gprsize] +%define r3m [rsp+6*mmsize+ 6*gprsize] +%define r4m [rsp+6*mmsize+ 7*gprsize] +%define r5m [rsp+6*mmsize+ 8*gprsize] +%define r6m [rsp+6*mmsize+ 9*gprsize] +%define r7m [rsp+6*mmsize+10*gprsize] +%define r8m [rsp+6*mmsize+11*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, picptrq ; the r5m slot now keeps the PIC base (scaling was read from it above) +%else +cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r7, [pb_mask] +%define base r7-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, 
[base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + pcmpeqw m2, m2 + psrldq m2, 14 ; only the 2 lowest bytes of m2 stay set + movd m4, [base+max+r6*4] + movd m5, [base+min+r6*2] + punpcklwd m3, m3 + punpcklwd m4, m4 + punpcklwd m5, m5 + pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + pshufd m5, m5, q0000 + SCRATCH 2, 10, 0 + SCRATCH 3, 11, 1 + SCRATCH 4, 12, 2 + SCRATCH 5, 13, 3 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz .no_vertical_overlap + mova m6, [base+pw_1024] + movd m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 4 + SCRATCH 7, 15, 5 + test sbyd, sbyd + jnz .vertical_overlap + ; fall-through + +.no_vertical_overlap: + mov r8m, overlapd ; the r8m slot holds the overlap flags from here on (sby was read above) +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq ; w now counts up from -w towards 0 + sub dstmp, srcq ; dstm = dst - src, stores below address dst as dstq+srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused + + mov offyd, seed + mov offxd, seed +%endif + 
ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused +%endif + +.loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq, r0, r5, m3 + vpgatherdw m5, m1, scalingq, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq, r12, r13, m3 + vpgatherdw m5, m1, scalingq, r12, r13, m3 +%endif + pcmpeqw m3, m3 + psrlw m3, 8 ; m3 = 0x00ff per word: keep the low byte of each gathered word + pand m4, m3 + pand m5, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 ; m7 = (0 > grain), used to sign-extend grain to words + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp ; x86-32 reloads dst - src here (r0 was gather scratch) + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 ; toggle bit 2 of the flags, CF = its old value + jc .next_blk + + add offxyd, 16 + test dword r8m, 2 ; r8m & 2 = have_top_overlap + jz .loop_x_odd + +%if ARCH_X86_32 + add dword [rsp+6*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jnz .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + ; r0m=dst, 
r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 + + add offxyd, 16 ; left_offxyd + mov [rsp+6*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq, r0, r5, m3 + vpgatherdw m5, m1, scalingq, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq, r12, r13, m3 + vpgatherdw m5, m1, scalingq, r12, r13, m3 +%endif + pcmpeqw m3, m3 + psrlw m3, 8 + pand m4, m3 + pand m5, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+6*mmsize+0*gprsize] ; left_offxy + movd m7, [grain_lutq+r5] +%else + movd m7, [grain_lutq+left_offxyq] +%endif + punpcklbw m7, m3 + pmaddubsw m6, m15, m7 + pmulhrsw m6, m14 + packsswb m6, m6 + pand m6, m10 ; NOTE(review): m10 is presumably the low-2-byte mask from the prologue (SCRATCH 2, 10, 0) - confirm + pandn m7, m10, m3 + por m6, m7 + pcmpgtb m2, m6 + punpcklbw m7, m6, m2 + punpckhbw m6, m2 + + ; noise = round2(scaling[src] * grain, scaling_shift) + 
pmullw m7, m4 + pmullw m6, m5 + pmulhrsw m7, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 ; flips the same flag bit (bit 2) that the other paths toggle with btc + add offxyd, 16 + + ; since this half-block had left-overlap, the next does not + test dword r8m, 2 ; have_top_overlap + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+6*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 ; r4 is named see by the DEFINE_ARGS below + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 
+ mov seed, r3m +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed, + ; because of the 'and tmpd, 0x00ff00ff' above + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+6*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 ; keep cur_offxy (high half of the packed pair) + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+8], r5 ; stack slot = pointer to the current row's blend weights +%else + mova m8, [pb_27_17] +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq, r0, r5, m3 + vpgatherdw m5, m1, scalingq, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq, r12, r13, m3 + vpgatherdw m5, m1, scalingq, r12, r13, m3 +%endif + pcmpeqw m3, m3 + psrlw m3, 8 + pand m4, m3 + pand m5, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+6*mmsize+1*gprsize] + movu 
m7, [grain_lutq+r5] +%else + movu m7, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m7, m3 + punpcklbw m7, m3 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+8] + pmaddubsw m3, [r5], m6 + pmaddubsw m6, [r5], m7 +%else + pmaddubsw m3, m8, m6 + pmaddubsw m6, m8, m7 +%endif + pmulhrsw m3, m14 + pmulhrsw m6, m14 + packsswb m6, m3 + pcmpgtb m7, m2, m6 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m6, m5 + pmulhrsw m2, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+8], mmsize ; next row's weights; presumably pb_17_27 follows pb_27_17 in memory +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 ; bit 16 of hd flags the first overlapped row (dec hw leaves it alone); CF is set on the second + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+6*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+8], r5 + + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak + + mov r5, [rsp+6*mmsize+1*gprsize] + mov r4, offxyd + add r5, 16 + add r4, 16 + mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak + + xor tmpd, tmpd + mov seed, r3m +%else + mova m8, [pb_27_17] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 + + ; we assume from the block 
above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] ; offyq is the register that held offxy for the previous block + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut + + movzx r5, offxyw ; top_offxy + mov [rsp+6*mmsize+1*gprsize], r5 +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw +%endif + shr offxyd, 16 + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy + movu m6, [grain_lutq+r5] + mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy + movd m4, [grain_lutq+r0] + movd m7, [grain_lutq+r5] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] + movd m7, [grain_lutq+topleft_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m4, m3 + punpcklbw m7, m6 + pmaddubsw m2, m15, m4 + pmaddubsw m4, m15, m7 + pmulhrsw m2, m14 + pmulhrsw m4, m14 + packsswb m2, m2 + packsswb m4, m4 + pand m2, m10 + pand m4, m10 + pandn m7, m10, 
m3 + pandn m3, m10, m6 + por m7, m2 + por m3, m4 + ; followed by v interpolation (top | cur -> cur) + punpckhbw m4, m3, m7 + punpcklbw m3, m7 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+8] + pmaddubsw m7, [r5], m4 + pmaddubsw m4, [r5], m3 +%else + pmaddubsw m7, m8, m4 + pmaddubsw m4, m8, m3 +%endif + pmulhrsw m7, m14 + pmulhrsw m4, m14 + packsswb m4, m7 + pxor m2, m2 + pcmpgtb m7, m2, m4 + punpcklbw m3, m4, m7 + punpckhbw m4, m7 + + ; src + mova m0, [srcq] + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m5, m0, scalingq, r0, r5, m7 + vpgatherdw m6, m1, scalingq, r0, r5, m7 +%else + vpgatherdw m5, m0, scalingq, r13, r14, m7 ; r12 is topleft_offxy in this loop, presumably why r13/r14 serve as scratch + vpgatherdw m6, m1, scalingq, r13, r14, m7 +%endif + pcmpeqw m7, m7 + psrlw m7, 8 + pand m5, m7 + pand m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m3, m5 + pmullw m4, m6 + pmulhrsw m3, m11 + pmulhrsw m4, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+8], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+6*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, +; sby, luma, lstride, 
uv_pl, is_id) +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 +cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov [rsp+8*mmsize+3*gprsize], r0 + mov [rsp+8*mmsize+5*gprsize], r1 + mov [rsp+8*mmsize+7*gprsize], r2 + mov [rsp+8*mmsize+9*gprsize], r3 + mov [rsp+8*mmsize+10*gprsize], r4 + + mov r0, r8m + mov r1, r9m + mov r2, r10m + mov r4, r11m + mov r3, r12m + mov [rsp+8*mmsize+11*gprsize], r0 + mov [rsp+8*mmsize+12*gprsize], r1 + mov [rsp+8*mmsize+13*gprsize], r2 + mov [rsp+8*mmsize+14*gprsize], r4 +%else +cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, r5 +%else +cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + pcmpeqw m2, m2 + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + lea tmpd, [r6d*2] +%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize + test r3, r3 +%else + cmp dword r12m, 0 ; is_idm +%endif + movd m5, [base+min+r6*2] + cmovne r6d, tmpd + movd m4, [base+max+r6*2] 
+ psrldq m2, 14+%2 + punpcklwd m3, m3 + punpcklwd m5, m5 + punpcklwd m4, m4 + pshufd m3, m3, q0000 + pshufd m5, m5, q0000 + pshufd m4, m4, q0000 + SCRATCH 2, 10, 0 + SCRATCH 3, 11, 1 + SCRATCH 4, 12, 2 + SCRATCH 5, 13, 3 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + +%if %1 + mov r6d, dword r11m + movd m0, [fg_dataq+FGData.uv_mult+r6*4] + movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklbw m6, m1, m0 + movd m7, [fg_dataq+FGData.uv_offset+r6*4] + punpcklwd m6, m6 + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + SCRATCH 6, 14, 4 + SCRATCH 7, 15, 5 +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz %%no_vertical_overlap +%if ARCH_X86_32 +%if %2 + movd m1, [base+pb_23_22] +%else + movd m1, [base+pb_27_17_17_27] +%endif + mova m0, [base+pw_1024] +%else +%if %2 + movd m1, [pb_23_22] +%else + movd m1, [pb_27_17_17_27] +%endif + mova m0, [pw_1024] +%endif + pshufd m1, m1, q0000 + SCRATCH 0, 8, 6 + SCRATCH 1, 9, 7 + test sbyd, sbyd + jnz %%vertical_overlap + ; fall-through + +%%no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak +%define luma_bakq lumaq + + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + 
+ mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride, luma_bak +%endif + +%%loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw 
m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq, r0, r5 + vpgatherdw m5, m6, scalingq, r0, r5 +%else + vpgatherdw m7, m4, scalingq, r12, r2 + vpgatherdw m5, m6, scalingq, r12, r2 +%endif + pcmpeqw m1, m1 + psrlw m1, 8 + pand m7, m1 + pand m5, m1 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] + pcmpgtb m6, m2, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; we already incremented lumaq above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: +%endif + test dword r8m, 1 + jz %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 +%if %2 + lea r6, [offxyd+16] + mov 
[rsp+8*mmsize+0*gprsize], r6 +%else + mov [rsp+8*mmsize+0*gprsize], offxyd +%endif + + DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + +%if %2 + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + 
punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq, r0, r5 + vpgatherdw m5, m6, scalingq, r0, r5 +%else + vpgatherdw m7, m4, scalingq, r12, r2 + vpgatherdw m5, m6, scalingq, r12, r2 +%endif + pcmpeqw m1, m1 + psrlw m1, 8 + pand m7, m1 + pand m5, m1 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+0*gprsize] + movd m4, [grain_lutq+r0+ 0] +%else + movd m4, [grain_lutq+left_offxyq+ 0] +%endif + punpcklbw m2, m4, m3 + pmaddubsw m4, m9, m2 + pmulhrsw m4, m8 + packsswb m4, m4 + pand m4, m10 + pandn m2, m10, m3 + por m3, m4, m2 + pxor m4, m4 + pcmpgtb m4, m3 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 
+%endif + add offxyd, 16 +%endif + + ; r8m = sbym + test dword r8m, 2 +%if %2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif + +%%end: + RET + +%%vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor tmpd, tmpd +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) 
| top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%endif + +%%loop_x_odd_v_overlap: + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m + mova m1, [base+pb_27_17] +%else + mova m1, [pb_27_17] +%endif +%%loop_y_v_overlap: +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw 
m7, m4, scalingq, r0, r5 + vpgatherdw m5, m6, scalingq, r0, r5 +%else + vpgatherdw m7, m4, scalingq, r12, r2 + vpgatherdw m5, m6, scalingq, r12, r2 +%endif + pcmpeqw m4, m4 + psrlw m4, 8 + pand m7, m4 + pand m5, m4 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] + movu m4, [grain_lutq+r0] +%else + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m4, m3 + punpcklbw m4, m3 +%if %3 + pmaddubsw m2, m9, m6 + pmaddubsw m3, m9, m4 +%else + pmaddubsw m2, m1, m6 + pmaddubsw m3, m1, m4 +%endif + pmulhrsw m2, m8 + pmulhrsw m3, m8 + packsswb m3, m2 + pxor m6, m6 + pcmpgtb m6, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; unpack chroma_source + pxor m4, m4 + punpckhbw m6, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m6, m3 + pmaxsw m0, m13 + pmaxsw m6, m13 + pminsw m0, m12 + pminsw m6, m12 + packuswb m0, m6 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + dec hw + je %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 +%if %3 == 0 + btc hd, 16 +%if ARCH_X86_32 + mov r5, r5m + mova m1, [base+pb_17_27] +%else + mova m1, [pb_17_27] +%endif + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif + +%if %2 + ; since 
fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jnc %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused + + mov r6, [rsp+8*mmsize+1*gprsize] +%if %2 + lea r0, [r3d+16] + add r6, 16 + mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy +%else + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy +%endif + mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused + + mov seed, r3m + xor tmpd, tmpd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + +%if %2 + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offxyq+16] +%else + mov topleft_offxyq, top_offxyq + mov left_offxyq, offxyq +%endif + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, 
grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m + mova m3, [base+pb_27_17] +%else + mova m3, [pb_27_17] +%endif +%%loop_y_hv_overlap: + ; src +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq, r0, r5 + vpgatherdw m5, m6, scalingq, r0, r5 +%else + movd m1, [grain_lutq+topleft_offxyq] +%if %3 + vpgatherdw m7, m4, scalingq, r2, r12 + vpgatherdw m5, m6, scalingq, r2, r12 +%else + vpgatherdw m7, m4, scalingq, r2, r13 + vpgatherdw m5, m6, scalingq, r2, r13 +%endif +%endif + pcmpeqw m2, m2 + psrlw m2, 8 + pand m7, m2 + pand m5, m2 + + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy +%endif + movu m2, 
[grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 +%if %2 + punpcklwd m4, m1 +%else + punpckldq m4, m1 +%endif + pmaddubsw m1, m9, m4 + pmulhrsw m1, m8 + packsswb m1, m1 + pandn m4, m10, m2 + pandn m2, m10, m6 + psrldq m6, m1, 2-%2 + pand m1, m10 + pand m6, m10 + por m4, m1 + por m2, m6 + ; followed by v interpolation (top | cur -> cur) + punpckhbw m1, m2, m4 + punpcklbw m2, m4 +%if %3 + pmaddubsw m4, m9, m1 + pmaddubsw m1, m9, m2 +%else + pmaddubsw m4, m3, m1 + pmaddubsw m1, m3, m2 +%endif + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + punpckhbw m1, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m1, m5 + pmulhrsw m2, m11 + pmulhrsw m1, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; unpack chroma source + pxor m4, m4 + punpckhbw m5, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m5, m1 + pmaxsw m0, m13 + pmaxsw m5, m13 + pminsw m0, m12 + pminsw m5, m12 + packuswb m0, m5 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has been adjusted above already +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*(1+%2)] +%else + add lumaq, r10mp +%endif +%endif + add grain_lutq, 82 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap +%if ARCH_X86_32 + mov r5, r5m + mova m3, [base+pb_17_27] +%else + mova m3, [pb_17_27] +%endif + btc hd, 16 + jnc %%loop_y_hv_overlap +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif + jmp %%loop_y_h_overlap +%%end_y_hv_overlap: +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif +%endif + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, 
offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + xor dword r8m, 4 + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 422, 1, 0 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 444, 0, 0 diff --git a/third_party/dav1d/src/x86/ipred.asm b/third_party/dav1d/src/x86/ipred.asm new file mode 100644 index 0000000000..20fd89dc12 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred.asm @@ -0,0 +1,5387 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 +pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 
16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 + db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 +z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 + db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line +pb_128: times 4 db 128 ; those are just placed here for alignment. +pb_36_m4: times 2 db 36, -4 +z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 +z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 +z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 +z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 + dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 +z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 + db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 +; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 +filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 + db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 +filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 +pb_127_m127: times 2 db 127, -127 +ipred_v_shuf: 
db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 + db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 +ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 + db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 +pw_64: times 2 dw 64 + +cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 + times 9 db 7, -1 +cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ; w=8, w_pad=1 as well as second half of previous one +cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 + times 5 db 6, 7 + ; w=16,w_pad=2 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + times 8 db 14, 15 + ; w=16,w_pad=3 + db 0, 1, 2, 3, 4, 5 + times 13 db 6, 7 +pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +%define pb_0to15 cfl_ac_w16_pad_shuffle +%define pb_1 (ipred_h_shuf+12) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+ 4) +%define pb_4 (ipred_h_shuf+24) +%define pb_5 (ipred_h_shuf+ 8) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_8 (z_upsample2 +12) +%define pb_12 (z2_y_shuf_h4+20) +%define pb_14 (z2_y_shuf_h4+ 4) +%define pb_15 (z_filter_s +32) +%define pb_27 (z2_y_shuf_h4+ 8) +%define pb_31 (z2_y_shuf_h4+12) +%define pb_32 (z2_y_shuf_h4+16) +%define pb_90 (z2_y_shuf_h4+ 0) +%define pw_1 (z2_y_shuf_h4+24) +%define pw_8 (z_filter_k +32) + +pw_62: times 2 dw 62 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 +pw_512: times 2 dw 512 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) +%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) + +JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, avx2, w4, w8, w16, 
w32, w64 +JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 +JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 +JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +INIT_YMM avx2 +cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h ; DC prediction from the w pixels starting at tl+1 + lea r5, [ipred_dc_left_avx2_table] + tzcnt wd, wm ; index of the lowest set bit of w, used as jump-table index + inc tlq ; tl+1: first pixel that is averaged + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd ; 0x8000 >> tzcnt(w): with pmulhrsw this is a rounded divide by w + movd xm3, r6d + movsxd r6, [r5+wq*4] ; reduction entry (.hN labels of ipred_dc_left), selected by w here + pcmpeqd m2, m2 ; every byte = -1 + pmaddubsw m0, m2 ; words = -(sum of each adjacent pixel pair) + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] ; .sN store loop of ipred_dc for this width + add wq, r5 + jmp r6 + +cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction from the h pixels starting at tl-h + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq ; tl-h: first pixel that is averaged + tzcnt wd, wm + movu m0, [tlq] + mov r5d, 0x8000 + shrx r5d, r5d, r6d ; 0x8000 >> tzcnt(h): with pmulhrsw this is a rounded divide by h + movd xm3, r5d + lea r5, [ipred_dc_left_avx2_table] + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 ; every byte = -1 + pmaddubsw m0, m2 ; words = -(sum of each adjacent pixel pair) + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] ; .sN store loop of ipred_dc for this width + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 ; fold the upper 128 bits into the lower +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 ; fold the upper qword into the lower +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 ; fold dword 1 into dword 0 +.h4: 
+ pmaddwd xm0, xm2 ; times -1 and pair-add: positive pixel sum in the low dword + pmulhrsw xm0, xm3 ; rounded divide by the pixel count (xm3 = 0x8000 >> tzcnt(count)) + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + mova m1, m0 ; second register for the 64-wide store loop + jmp wq ; continue in the ipred_dc .sN store loop + +cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction from the h pixels before tl and the w pixels after it + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d ; w+h, halved below to form the rounding bias + tzcnt r5d, r5d + movd xm5, r5d ; tzcnt(w+h): shift count for the power-of-two part of the divide + lea r5, [ipred_dc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] ; +5*4 skips the five h entries of the table + pcmpeqd m3, m3 ; every byte = -1 + psrlw xm4, 1 ; (w+h) >> 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 ; words = -(sum of each adjacent pixel pair) + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw xm0, xm4 ; the sums are negated, so the bias is subtracted + paddw xm0, xm1 + pmaddwd xm0, xm3 ; times -1 and pair-add: positive partial sums + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 ; w = h = 4: divide by 8 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d ; count h*2 is taken mod 32: h=8 leaves 0x5556 (~2^16/3), h=16 leaves 0x3334 (~2^16/5) in the low word + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 ; divide by 4 + pmulhuw xm0, xm1 ; then by 3 or 5 via the 16-bit reciprocal +.w4_end: + vpbroadcastb xm0, xm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .s4 + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 ; divide by the power-of-two part of w+h + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 ; ~2^16/3 + mov r2d, 0x3334 ; ~2^16/5 + cmp hd, 32 + cmove r6d, r2d ; h = 32: w+h = 40 = 8*5, so divide by 5 instead of 3 + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastb xm0, xm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .s8 + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, 
xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 ; divide by the power-of-two part of w+h + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 ; ~2^16/3 + mov r2d, 0x3334 ; ~2^16/5 + test hb, 8|32 + cmovz r6d, r2d ; h has neither bit 8 nor bit 32 set: use the 1/5 factor + movd xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastb xm0, xm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 ; words = -(sum of each adjacent pixel pair) + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 ; the sums are negated, so the bias is subtracted + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 ; divide by the power-of-two part of w+h + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d ; count h*2 is taken mod 32: h=8 leaves 0x3334 (~2^16/5), h=16 and h=64 leave 0x5556 (~2^16/3) in the low word + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastb m0, xm0 +.s32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-32] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+33] + pmaddubsw m1, m3 + pmaddubsw m2, m3 + paddw m0, m1 + paddw m0, m2 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 ; divide by the power-of-two part of w+h + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd ; count h is taken mod 32: h=16 leaves 0x3334 (~2^16/5), h=32 leaves 0x5556 (~2^16/3) in the low word + movd xm1, r6d + pmulhuw xm0, xm1 +.w64_end: + vpbroadcastb m0, xm0 + mova m1, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .s64 + RET + 
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 ; fills the block with the byte value 128 + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] ; .sN store loop of ipred_dc for this width + vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] ; pb_128, addressed relative to r5 + mova m1, m0 ; second register for the 64-wide store loop + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 ; copies the bytes starting at tl+1 into every output row + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movu m0, [tlq+ 1] ; bytes 0-31 of the row + movu m1, [tlq+33] ; bytes 32-63, stored only by the 64-wide loop + movifnidn hd, hm + movsxd wq, [r5+wq*4] ; .sN store loop of ipred_dc for this width + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastb m0, [tlq-1] ; row 0 is filled with the byte at tl-1 + vpbroadcastb m1, [tlq-2] ; row 1 with the byte at tl-2 + vpbroadcastb m2, [tlq-3] ; row 2 with the byte at tl-3 + sub tlq, 4 + vpbroadcastb m3, [tlq+0] ; row 3 with the byte at the old tl-4 + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .w%1 ; back to the label in front of the macro invocation + RET +ALIGN function_align +%endmacro + +INIT_XMM avx2 +cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3 ; fills each output row with one byte read from before tl + lea r5, [ipred_h_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4, d +.w8: + IPRED_H 8, q +.w16: + IPRED_H 16, a +INIT_YMM avx2 +.w32: + IPRED_H 32, a +.w64: + vpbroadcastb m0, [tlq-1] + vpbroadcastb m1, [tlq-2] + vpbroadcastb m2, [tlq-3] + sub tlq, 4 + vpbroadcastb m3, [tlq+0] + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .w64 + RET + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 ; Calculating tldiff normally requires + pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it + pand m0, m4 ; in 8-bit with some tricks which avoids + psubusb m2, m5, m1 ; having to unpack everything to 16-bit. 
+ psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff + vpblendvb m0, m%1, m3, m0 ; left (m3) where ldiff <= tdiff, otherwise top + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff + vpblendvb m0, m5, m0, m1 ; keep that pick where the condition holds, otherwise topleft (m5) +%endmacro + +cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h ; per pixel, selects left, top or topleft with the PAETH macro +%define base r5-ipred_paeth_avx2_table + lea r5, [ipred_paeth_avx2_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m4, [base+pb_1] ; every byte = 1, the mask used inside PAETH + add wq, r5 + jmp wq +.w4: + vpbroadcastd m6, [tlq+1] ; top + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 8 + vpbroadcastq m3, [tlq] + pshufb m3, m8 ; left + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret ; h = 4: only the four rows above are written + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 ; eight rows per iteration + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m6, [tlq+1] ; top + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w8_loop: + sub tlq, 4 + vpbroadcastd m3, [tlq] + pshufb m3, m8 ; left + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m6, [tlq+1] ; top + mova xm8, xm4 ; lower half = 1, upper half = 0 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w16_loop: + sub tlq, 2 + vpbroadcastd m3, [tlq] + pshufb m3, m8 ; left + PAETH 6, 7 + mova [dstq+strideq*0], xm0 + vextracti128 
[dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] ; top + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w32_loop: + dec tlq + vpbroadcastb m3, [tlq] ; left pixel of this row + PAETH 6, 7 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+ 1] ; top, bytes 0-31 + movu m7, [tlq+33] ; top, bytes 32-63 +%if WIN64 + movaps r4m, xmm9 ; m9 is used below but lies outside the 9 registers declared by cglobal, so it is saved by hand +%endif + psubusb m8, m5, m6 + psubusb m0, m6, m5 + psubusb m9, m5, m7 + psubusb m1, m7, m5 + por m8, m0 ; ldiff, bytes 0-31 + por m9, m1 ; ldiff, bytes 32-63 +.w64_loop: + dec tlq + vpbroadcastb m3, [tlq] ; left pixel of this row + PAETH 6, 8 + mova [dstq+32*0], m0 + PAETH 7, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop +%if WIN64 + movaps xmm9, r4m ; restore the hand-saved register +%endif + RET + +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + pmaddubsw m0, m%3, m%1 ; unsigned pixel pairs times signed offset-weight pairs, summed per word + pmaddubsw m1, m%4, m%2 + paddw m0, m%5 ; add the precomputed 128 * a + 129 * b + rounding term + paddw m1, m%6 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 ; 8-bit result in m0 +%endmacro + +cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights ; blends each top pixel with the bottom pixel using one weight pair per row +%define base r6-ipred_smooth_v_avx2_table + lea r6, [ipred_smooth_v_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] ; end of the h weight pairs (they start at byte offset h*2) + neg hq ; hq counts up from -h towards 0 + vpbroadcastb m5, [tlq+hq] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + punpcklbw m2, m5 ; top, bottom + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + punpckldq m4, m5, m5 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 ; 127 * top - 127 * bottom + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti128 m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 1 + pextrd [dstq+r3 ], xm1, 1 + cmp hd, -4 + je .ret + lea dstq, [dstq+strideq*4] + 
pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm1, 2 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + add hq, 8 ; eight rows per iteration, hq runs from -h up to 0 + jl .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m2, [tlq+1] + punpcklbw m2, m5 ; top, bottom + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 ; 127 * top - 127 * bottom + paddw m1, m2 ; 1 * top + 256 * bottom + 128 + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w8_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 ; four rows per iteration + jl .w8_loop + RET +ALIGN function_align +.w16: + WIN64_SPILL_XMM 7 + vbroadcasti128 m3, [tlq+1] + mova m6, [base+ipred_v_shuf] + punpcklbw m2, m3, m5 ; top, bottom (columns 0-7) + punpckhbw m3, m5 ; top, bottom (columns 8-15) + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 ; 128 * top + 129 * bottom + 128 + paddw m5, m1 +.w16_loop: + vpbroadcastd m1, [weightsq+hq*2] ; weight pairs of two consecutive rows + pshufb m1, m6 ; first row in the low lane, second row in the high lane + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 ; two rows per iteration + jl .w16_loop + RET +ALIGN function_align +.w32: + %assign stack_offset stack_offset - stack_size_padded ; NOTE(review): presumably rewinds the bookkeeping of the earlier WIN64_SPILL_XMM so another one can follow - confirm against x86inc + WIN64_SPILL_XMM 6 + movu m3, [tlq+1] + punpcklbw m2, m3, m5 ; top, bottom + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 ; 128 * top + 129 * bottom + 128 + paddw m5, m1 +.w32_loop: + vpbroadcastw m1, [weightsq+hq*2] ; weight pair of this row + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m0 + add dstq, strideq + inc hq + jl .w32_loop + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 11 + movu m4, [tlq+ 1] + movu m8, [tlq+33] + punpcklbw m3, m4, m5 ; top, bottom + punpckhbw m4, m5 + punpcklbw m7, m8, m5 + punpckhbw m8, m5 + pmaddubsw m5, m3, m0 + pmaddubsw m6, m4, m0 + pmaddubsw m9, m7, m0 + pmaddubsw m10, m8, m0 + paddw m2, m1, m3 + paddw m5, m2 + paddw m2, m1, m4 + paddw m6, m2 + paddw m0, m1, m7 + paddw m9, m0 + 
paddw m1, m8 + paddw m10, m1 +.w64_loop: + vpbroadcastw m2, [weightsq+hq*2] ; weight pair of this row + SMOOTH 2, 2, 3, 4, 5, 6 + mova [dstq+32*0], m0 + SMOOTH 2, 2, 7, 8, 9, 10 + mova [dstq+32*1], m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used + %assign stack_offset 0 ; restart the stack bookkeeping from scratch + %assign stack_size_padded 0 + %assign regs_used %2 + %xdefine rstk rsp + SETUP_STACK_POINTER %1 + %if regs_used != %2 && WIN64 + PUSH r%2 ; NOTE(review): presumably SETUP_STACK_POINTER claimed one extra register that must be preserved on Win64 - confirm against x86inc + %endif + ALLOC_STACK %1, %3 +%endmacro + +cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h ; blends each left pixel with the right pixel using one weight pair per column +%define base r6-ipred_smooth_h_avx2_table + lea r6, [ipred_smooth_h_avx2_table] + mov wd, wm + vpbroadcastb m3, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm ; 32-bit write also clears the upper half of hq, which is used for addressing + movsxd wq, [r6+wq*4] + vpbroadcastd m4, [base+pb_127_m127] + vpbroadcastd m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + WIN64_SPILL_XMM 8 + vpbroadcastq m6, [base+smooth_weights+4*2] ; the four weight pairs for w = 4 + mova m7, [base+ipred_h_shuf] + sub tlq, 8 + sub tlq, hq ; so that [tlq+hq] walks the left pixels while hd counts down + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 ; (w - 128) * left + (127 - w) * right + paddw m1, m5 ; + 128 for rounding + paddw m0, m1 ; w * left + (256 - w) * right + 128 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret ; h = 4: only the four rows above are written + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 ; eight rows per iteration + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded ; NOTE(review): presumably rewinds the bookkeeping of the earlier WIN64_SPILL_XMM so another one can follow - confirm against x86inc + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [base+smooth_weights+8*2] ; the eight weight pairs for w = 8 + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq ; so that [tlq+hq] walks the left pixels while hd counts down
+.w8_loop: + vpbroadcastd m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 ; (w - 128) * left + (127 - w) * right + paddw m1, m5 ; + 128 for rounding + paddw m0, m1 ; w * left + (256 - w) * right + 128 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 ; four rows per iteration + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 8 + lea r3, [rsp+64*2-4] + call .prep ; only worthwhile for w16 and above + sub tlq, 2 + vpbroadcastd xm6, [base+pb_1] ; byte index 1 in the low lane, 0 in the (zeroed) high lane + mova xm7, [base+ipred_v_shuf+16] + vinserti128 m7, [base+ipred_v_shuf+ 0], 1 + vbroadcasti128 m4, [base+smooth_weights+16*2] ; weight pairs for columns 0-7 of w = 16 + vbroadcasti128 m5, [base+smooth_weights+16*3] ; weight pairs for columns 8-15 +.w16_loop: + vpbroadcastd m1, [tlq+hq] + vpbroadcastd m2, [r3+hq*2] ; words precomputed by .prep for two rows + pshufb m1, m6 + punpcklbw m1, m3 ; left, right + pshufb m2, m7 + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 ; two rows per iteration + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 6 + lea r3, [rsp+64*2-2] + call .prep + dec tlq + mova xm4, [base+smooth_weights+16*4] ; weight pairs for columns 0-7 ... + vinserti128 m4, [base+smooth_weights+16*6], 1 ; ... and 16-23 (packuswb in SMOOTH works per 128-bit lane) + mova xm5, [base+smooth_weights+16*5] ; weight pairs for columns 8-15 ... + vinserti128 m5, [base+smooth_weights+16*7], 1 ; ... and 24-31 +.w32_loop: + vpbroadcastb m1, [tlq+hq] + punpcklbw m1, m3 ; left, right + vpbroadcastw m2, [r3+hq*2] ; word precomputed by .prep for this row + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*4, 7, 9 + lea r3, [rsp+64*2-2] + call .prep + add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table ; r6 = last 16 bytes of the w = 64 weight pairs + dec tlq + mova xm5, [r6-16*7] + vinserti128 m5, [r6-16*5], 1 + mova xm6, [r6-16*6] + vinserti128 m6, [r6-16*4], 1 + mova xm7, [r6-16*3] + vinserti128 m7, [r6-16*1], 1 + mova xm8, [r6-16*2] + vinserti128 m8, [r6-16*0], 1 +.w64_loop: 
+ vpbroadcastb m2, [tlq+hq] + punpcklbw m2, m3 ; left, right + vpbroadcastw m4, [r3+hq*2] ; word precomputed by .prep for this row + SMOOTH 5, 6, 2, 2, 4, 4 + mova [dstq+32*0], m0 + SMOOTH 7, 8, 2, 2, 4, 4 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +ALIGN function_align +.prep: + vpermq m2, [tlq-32*1], q3120 ; the 32 bytes just before tl + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m1, m5 ; 1 * left + 256 * right + 128 + paddw m0, m1 ; 128 * left + 129 * right + 128 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + vpermq m2, [tlq-32*2], q3120 ; the 32 bytes before those + mova [rsp+gprsize+32*3], m0 ; +gprsize skips the return address pushed by call + mova [rsp+gprsize+32*2], m1 + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*0], m1 + sub r3, hq ; r3 -= 2*h in total (see the second sub below) + sub tlq, hq ; tlq -= h, so callers index with +hq while hd counts down + sub r3, hq + ret + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddubsw m0, m%3, m%1 + pmaddubsw m1, m%4, m%2 +%ifnum %5 + paddw m0, m%5 ; add given as a register number +%else + paddw m0, %5 ; add given as a full operand +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif + pavgw m0, m2 ; (m0 + m2 + 1) >> 1, m2 and m3 are prepared by the caller + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 ; 8-bit result in m0 +%endmacro + +cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_avx2_table + lea r6, [ipred_smooth_avx2_table] + mov wd, wm + vpbroadcastb m4, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastb m0, [r5] ; bottom + vpbroadcastd m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] + jmp wq +.w4: + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vpbroadcastq m11, [base+smooth_weights+4*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastd m8, [tlq+1] + sub tlq, 8 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 ; top, bottom + pshufd m6, m7, q2200 + pshufd m7, m7, q3311 + pmaddubsw m9, m8, m5 + paddw m3, m8 ; 1 * top + 256 * bottom + 255 + paddw m9, m3 ; 128 
* top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vbroadcasti128 m1, [v_weightsq] + add v_weightsq, 16 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vbroadcasti128 m11, [base+smooth_weights+8*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastq m8, [tlq+1] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m9, m8, m5 + paddw m3, m8 + paddw m9, m3 +.w8_loop: + vpbroadcastd m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw m2, m0 + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 14 + vbroadcasti128 m11, [tlq+1] + lea r3, [rsp+64*2-4] + punpcklbw m10, m11, m0 ; top, bottom + punpckhbw 
m11, m0 + call .prep_v + sub tlq, 2 + pmaddubsw m12, m10, m5 + pmaddubsw m13, m11, m5 + vpbroadcastd xm5, [base+pb_1] + mova m9, [base+ipred_v_shuf] + vbroadcasti128 m6, [base+smooth_weights+16*2] + vbroadcasti128 m7, [base+smooth_weights+16*3] + vpermq m8, m9, q1032 + paddw m0, m10, m3 + paddw m3, m11 + paddw m12, m0 + paddw m13, m3 +.w16_loop: + vpbroadcastd m3, [tlq+hq] + vpbroadcastd m0, [r3+hq*2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + pshufb m3, m5 + punpcklbw m3, m4 ; left, right + pmaddubsw m2, m3, m6 + pmaddubsw m3, m7 + pshufb m0, m8 + pshufb m1, m9 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 10, 11, 12, 13 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 11 + movu m8, [tlq+1] + lea r3, [rsp+64*2-2] + punpcklbw m7, m8, m0 + punpckhbw m8, m0 + call .prep_v + dec tlq + pmaddubsw m9, m7, m5 + pmaddubsw m10, m8, m5 + mova xm5, [base+smooth_weights+16*4] + vinserti128 m5, [base+smooth_weights+16*6], 1 + mova xm6, [base+smooth_weights+16*5] + vinserti128 m6, [base+smooth_weights+16*7], 1 + paddw m0, m7, m3 + paddw m3, m8 + paddw m9, m0 + paddw m10, m3 +.w32_loop: + vpbroadcastb m3, [tlq+hq] + punpcklbw m3, m4 + vpbroadcastw m0, [r3+hq*2] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m3, m5 + pmaddubsw m3, m6 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 7, 8, 9, 10 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*8, 7, 16 + movu m13, [tlq+1 ] + movu m15, [tlq+33] + add r6, smooth_weights+16*15-ipred_smooth_avx2_table + lea r3, [rsp+64*2-2] + punpcklbw m12, m13, m0 + punpckhbw m13, m0 + punpcklbw m14, m15, m0 + punpckhbw m15, m0 + call .prep_v + dec tlq + pmaddubsw m0, m12, m5 + pmaddubsw m1, m13, m5 + pmaddubsw m2, m14, m5 + pmaddubsw m5, m15, m5 + mova xm8, [r6-16*7] + vinserti128 m8, [r6-16*5], 1 + mova 
xm9, [r6-16*6]
+    vinserti128 m9, [r6-16*4], 1
+    mova xm10, [r6-16*3]
+    vinserti128 m10, [r6-16*1], 1
+    mova xm11, [r6-16*2]
+    vinserti128 m11, [r6-16*0], 1
+    lea r6, [rsp+32*4]
+    paddw m0, m3
+    paddw m1, m3
+    paddw m2, m3
+    paddw m3, m5
+    paddw m0, m12
+    paddw m1, m13
+    paddw m2, m14
+    paddw m3, m15
+    mova [r6+32*0], m0
+    mova [r6+32*1], m1
+    mova [r6+32*2], m2
+    mova [r6+32*3], m3
+.w64_loop:
+    vpbroadcastb m5, [tlq+hq]
+    punpcklbw m5, m4
+    vpbroadcastw m6, [r3+hq*2]
+    vpbroadcastw m7, [v_weightsq]
+    add v_weightsq, 2
+    pmaddubsw m2, m5, m8
+    pmaddubsw m3, m5, m9
+    paddw m2, m6
+    paddw m3, m6
+    SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1]
+    mova [dstq+32*0], m0
+    pmaddubsw m2, m5, m10
+    pmaddubsw m3, m5, m11
+    paddw m2, m6
+    paddw m3, m6
+    SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3]
+    mova [dstq+32*1], m0
+    add dstq, strideq
+    dec hd
+    jg .w64_loop
+    RET
+ALIGN function_align
+.prep_v:
+    vpermq m2, [tlq-32*1], q3120
+    punpckhbw m1, m2, m4
+    punpcklbw m2, m4
+    pmaddubsw m0, m1, m5 ; 127 * left - 127 * right
+    paddw m0, m1 ; 128 * left + 129 * right
+    pmaddubsw m1, m2, m5
+    paddw m1, m2
+    vpermq m2, [tlq-32*2], q3120
+    mova [rsp+gprsize+32*3], m0
+    mova [rsp+gprsize+32*2], m1
+    punpckhbw m1, m2, m4
+    punpcklbw m2, m4
+    pmaddubsw m0, m1, m5
+    paddw m0, m1
+    pmaddubsw m1, m2, m5
+    paddw m1, m2
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*0], m1
+    sub r3, hq
+    sub tlq, hq
+    sub r3, hq
+    ret
+
+cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+    %assign org_stack_offset stack_offset
+    lea r6, [ipred_z1_avx2_table] ; z1 predictor entry: r6 = table of per-width handlers
+    tzcnt wd, wm ; table index = trailing zero count of w
+    movifnidn angled, anglem
+    movifnidn hd, hm
+    lea r7, [dr_intra_derivative]
+    inc tlq ; tlq += 1: all edge reads below are relative to tl[1]
+    movsxd wq, [r6+wq*4] ; 32-bit table entry ...
+    add wq, r6 ; ... stored relative to the table base
+    mov dxd, angled
+    and dxd, 0x7e ; even byte offset into the word-sized derivative table
+    add angled, 165 ; ~90
+    movzx dxd, word [r7+dxq] ; dx = per-row step of xpos (xpos >> 6 = base, xpos & 62 = frac)
+    xor angled, 0x4ff ; d = 90 - angle
+    vpbroadcastd m3, [pw_512] ; pmulhrsw by 512 == (x + 32) >> 6, assuming pw_512 holds words of 512
+    vpbroadcastd m4, [pw_62] ; mask that extracts frac from xpos
+    vpbroadcastd m5, [pw_64] ; used to form 64-frac
+    jmp wq ; dispatch to the per-width handler
+.w4:
+    cmp angleb, 40
+    jae .w4_no_upsample ; d >= 40 (cf. the condition spelled out at .w8)
+    lea r3d, [angleq-1024]
+    sar r3d, 7
+    add r3d, hd
+    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+    ALLOC_STACK -32, 8
+    mova xm1, [tlq-1]
+    pshufb xm0, xm1, [z_upsample1]
+    pshufb xm1, [z_upsample2]
+    vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+    add dxd, dxd ; pw_512 (which is already in m3)
+    pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
+    pextrd [rsp+16], xm1, 3 ; top[max_base_x]
+    pmaddubsw xm1, xm2
+    movd xm7, dxd
+    mov r3d, dxd ; xpos
+    vpbroadcastw m7, xm7
+    paddw xm1, xm0
+    movq xm0, [tlq]
+    pmulhrsw xm1, xm3
+    pslldq m6, m7, 8
+    paddw xm2, xm7, xm7
+    lea r2, [strideq*3]
+    paddw m6, m7
+    packuswb xm1, xm1
+    paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
+    punpcklbw xm0, xm1
+    psllw m7, 2
+    mova [rsp], xm0 ; upsampled edge lives on the stack; the loop indexes it through rsp
+.w4_upsample_loop:
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base0
+    vpbroadcastq m1, [rsp+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base1
+    vpbroadcastq m2, [rsp+r5]
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base2
+    movq xm0, [rsp+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base3
+    movhps xm0, [rsp+r5]
+    vpblendd m1, m2, 0xc0
+    pand m2, m4, m6 ; frac
+    vpblendd m0, m1, 0xf0
+    psubw m1, m5, m2 ; 64-frac
+    psllw m2, 8
+    por m1, m2 ; 64-frac, frac
+    pmaddubsw m0, m1
+    paddw m6, m7 ; xpos += dx
+    pmulhrsw m0, m3
+    packuswb m0, m0
+    vextracti128 xm1, m0, 1 ; low lane = rows 2-3, high lane = rows 0-1 (see stores)
+    movd [dstq+strideq*2], xm0
+    pextrd [dstq+r2 ], xm0, 1
+    movd [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4 ; four rows per iteration
+    jg .w4_upsample_loop
+    RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+    ; The C version uses a lot of branches, but we can do all the comparisons
+    ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+    lea r3, [z_filter_t0]
+    movd xm0, maxbased ; callers pass h+3 / h+7 / h+15 in maxbased
+    movd xm2, angled
+    shr angled, 8 ; is_sm << 1
+    vpbroadcastb m0, xm0
+    vpbroadcastb m2, xm2
+    pcmpeqb m1, m0, [base+z_filter_wh]
+    pand m1, m2
+    mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+    pcmpgtb m1, m2
+    pmovmskb r5d, m1 ; r5d = bitmask of passed comparisons; callers test and popcnt it
+    ret
+.w4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -16, 11
+    mov maxbased, 7
+    test angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w4_main
+    lea maxbased, [hq+3]
+    call .filter_strength
+    mov maxbased, 7 ; restore: maxbased doubled as the .filter_strength argument
+    test r5d, r5d
+    jz .w4_main ; filter_strength == 0
+    popcnt r5d, r5d ; r5d = filter_strength, nonzero here
+    vpbroadcastd m7, [base+pb_8]
+    vbroadcasti128 m2, [tlq-1]
+    pminub m1, m7, [base+z_filter_s]
+    vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+    pminub m7, [base+z_filter_s+8]
+    vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+    pshufb m0, m2, m1
+    shufps m1, m7, q2121
+    pmaddubsw m0, m8
+    pshufb m1, m2, m1
+    pmaddubsw m1, m9
+    pshufb m2, m7
+    pmaddubsw m2, m10
+    paddw m0, m1
+    paddw m0, m2
+    pmulhrsw m0, m3
+    mov r3d, 9
+    mov tlq, rsp ; from here on tlq points at the filtered copy stored below
+    cmp hd, 4
+    cmovne maxbased, r3d ; maxbase = h == 4 ? 7 : 9
+    vextracti128 xm1, m0, 1
+    packuswb xm0, xm1
+    mova [tlq], xm0
+.w4_main:
+    movd xm6, dxd
+    vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+    vpbroadcastb m7, [tlq+maxbaseq]
+    shl maxbased, 6
+    vpbroadcastw m6, xm6
+    mov r3d, dxd ; xpos
+    movd xm9, maxbased
+    vpbroadcastw m9, xm9
+    vbroadcasti128 m8, [z1_shuf_w4]
+    psrlw m7, 8 ; top[max_base_x]
+    paddw m10, m6, m6
+    psubw m9, m0 ; max_base_x
+    vpblendd m6, m10, 0xcc
+    mova xm0, xm10
+    paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
+    paddw m10, m10
+.w4_loop:
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base0
+    vpbroadcastq m1, [tlq+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base1
+    vpbroadcastq m2, [tlq+r5]
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base2
+    movq xm0, [tlq+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base3
+    movhps xm0, [tlq+r5]
+    vpblendd m1, m2, 0xc0
+    pand m2, m4, m6 ; frac
+    vpblendd m0, m1, 0xf0
+    psubw m1, m5, m2 ; 64-frac
+    psllw m2, 8
+    pshufb m0, m8
+    por m1, m2 ; 64-frac, frac
+    pmaddubsw m0, m1
+    pcmpgtw m1, m9, m6 ; base < max_base_x
+    pmulhrsw m0, m3
+    paddw m6, m10 ; xpos += dx
+    lea r5, [dstq+strideq*2]
+    vpblendvb m0, m7, m0, m1 ; mask ? interpolated pixel : top[max_base_x]
+    packuswb m0, m0
+    vextracti128 xm1, m0, 1 ; low lane = rows 2-3 (via r5), high lane = rows 0-1
+    movd [r5 +strideq*0], xm0
+    pextrd [r5 +strideq*1], xm0, 1
+    movd [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    sub hd, 4
+    jz .w4_end
+    lea dstq, [dstq+strideq*4]
+    cmp r3d, maxbased ; next xpos vs. maxbase << 6
+    jb .w4_loop
+    packuswb xm7, xm7 ; remaining rows are the constant top[max_base_x]: pack it to bytes
+    lea r6, [strideq*3]
+.w4_end_loop:
+    movd [dstq+strideq*0], xm7
+    movd [dstq+strideq*1], xm7
+    movd [dstq+strideq*2], xm7
+    movd [dstq+r6 ], xm7
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w4_end_loop
+.w4_end:
+    RET
+ALIGN function_align
+.w8:
+    lea r3d, [angleq+216]
+    mov r3b, hb
+    cmp r3d, 8
+    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -32, 8
+    movu xm2, [z_filter_s+6]
+    mova xm0, [tlq-1]
+    movd xm6, hd
+    vinserti128 m0, [tlq+7], 1
+    vpbroadcastb xm6, xm6
+    vbroadcasti128 m1, [z_upsample1]
+    pminub xm6, xm2
+    vpbroadcastd m7, [pb_36_m4]
+    vinserti128 m2, xm6, 1
+    add dxd, dxd
+    pshufb m1, m0, m1
+    pshufb m2, m0, m2
+    movd xm6, dxd
+    pmaddubsw m1, m7
+    pmaddubsw m2, m7
+    vpbroadcastw m6, xm6
+    mov r3d, dxd ; xpos
+    psrldq m0, 1
+    lea r2, [strideq*3]
+    paddw m7, m6, m6
+    paddw m1, m2
+    vpblendd m6, m7, 0xf0
+    pmulhrsw m1, m3
+    pslldq m2, m7, 8
+    paddw m7, m7
+    paddw m6, m2
+    packuswb m1, m1
+    punpcklbw m0, m1
+    mova [rsp], m0 ; upsampled edge lives on the stack; the loop indexes it through rsp
+.w8_upsample_loop:
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base0
+    movu xm0, [rsp+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base1
+    vinserti128 m0, [rsp+r5], 1
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base2
+    pand m1, m4, m6
+    psubw m2, m5, m1
+    psllw m1, 8
+    por m2, m1
+    punpcklqdq m1, m2, m2 ; frac0 frac1
+    pmaddubsw m0, m1
+    movu xm1, [rsp+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base3
+    vinserti128 m1, [rsp+r5], 1
+    punpckhqdq m2, m2 ; frac2 frac3
+    pmaddubsw m1, m2
+    pmulhrsw m0, m3
+    paddw m6, m7
+    pmulhrsw m1, m3
+    packuswb m0, m1
+    vextracti128 xm1, m0, 1 ; low lane = rows 0 and 2, high lane = rows 1 and 3 (see stores)
+    movq [dstq+strideq*0], xm0
+    movhps [dstq+strideq*2], xm0
+    movq [dstq+strideq*1], xm1
+    movhps [dstq+r2 ], xm1
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w8_upsample_loop
+    RET
+.w8_no_intra_edge_filter:
+    and maxbased, 7
+    or maxbased, 8 ; imin(h+7, 15)
+    jmp .w8_main
+.w8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -32, 10
+    lea maxbased, [hq+7] ; also the .filter_strength argument
+    test angled, 0x400
+    jnz .w8_no_intra_edge_filter
+    call .filter_strength
+    test r5d, r5d
+    jz .w8_main ; filter_strength == 0
+    popcnt r5d, r5d ; r5d = filter_strength
+    movu xm2, [tlq]
+    pminub xm1, xm0, [base+z_filter_s+14]
+    vinserti128 m2, [tlq-1], 1
+    vinserti128 m1, [base+z_filter_s+ 0], 1
+    vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+    pminub xm0, [base+z_filter_s+22]
+    vinserti128 m0, [base+z_filter_s+ 8], 1
+    pshufb m6, m2, m1
+    pmaddubsw m6, m7
+    vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+    movzx r3d, byte [tlq+15]
+    shufps m1, m0, q2121
+    pshufb m1, m2, m1
+    pmaddubsw m1, m7
+    paddw m1, m6
+    sub r5d, 3 ; r5d = filter_strength - 3
+    jnz .w8_3tap
+    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+    ; which also results in an awkward edge case where out[w*2] is
+    ; slightly different from out[max_base_x] when h > w.
+    vpbroadcastd m7, [z_filter_k+4*8]
+    movzx r2d, byte [tlq+14]
+    pshufb m2, m0
+    pmaddubsw m2, m7
+    sub r2d, r3d
+    lea r2d, [r2+r3*8+4]
+    shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+    mov [rsp+16], r2b
+    paddw m1, m2
+.w8_3tap:
+    pmulhrsw m1, m3
+    sar r5d, 1
+    mov tlq, rsp ; from here on tlq points at the filtered copy stored below
+    add r5d, 17 ; w*2 + (filter_strength == 3)
+    cmp hd, 16
+    cmovns maxbased, r5d ; only when h >= 16
+    mov [tlq+r5], r3b ; place the unfiltered last edge byte (read from tl+15 above) at index r5
+    vextracti128 xm0, m1, 1
+    packuswb xm0, xm1
+    mova [tlq], xm0
+.w8_main:
+    movd xm2, dxd
+    vbroadcasti128 m0, [z_base_inc]
+    vpbroadcastw m2, xm2
+    vpbroadcastb m7, [tlq+maxbaseq]
+    shl maxbased, 6
+    movd xm9, maxbased
+    vbroadcasti128 m8, [z_filter_s+2]
+    vpbroadcastw m9, xm9
+    psrlw m7, 8
+    psubw m9, m0
+    mov r3d, dxd ; xpos
+    paddw m6, m2, m2
+    vpblendd m2, m6, 0xf0
+.w8_loop:
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base0
+    pand m0, m4, m2
+    psubw m1, m5, m0
+    psllw m0, 8
+    por m1, m0
+    movu xm0, [tlq+r3]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base1
+    vinserti128 m0, [tlq+r5], 1
+    pshufb m0, m8
+    pmaddubsw m0, m1
+    pcmpgtw m1, m9, m2
+    paddw m2, m6
+    pmulhrsw m0, m3
+    vpblendvb m0, m7, m0, m1 ; mask ? interpolated pixel : broadcast edge pixel in m7
+    vextracti128 xm1, m0, 1
+    packuswb xm0, xm1
+    movq [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    sub hd, 2 ; two rows per iteration
+    jz .w8_end
+    lea dstq, [dstq+strideq*2]
+    cmp r3d, maxbased ; next xpos vs. maxbase << 6
+    jb .w8_loop
+    packuswb xm7, xm7 ; remaining rows are the constant in m7: pack it to bytes
+.w8_end_loop:
+    movq [dstq+strideq*0], xm7
+    movq [dstq+strideq*1], xm7
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jg .w8_end_loop
+.w8_end:
+    RET
+.w16_no_intra_edge_filter:
+    and maxbased, 15
+    or maxbased, 16 ; imin(h+15, 31)
+    jmp .w16_main
+ALIGN function_align
+.w16:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -64, 12
+    lea maxbased, [hq+15] ; also the .filter_strength argument
+    test angled, 0x400
+    jnz .w16_no_intra_edge_filter
+    call .filter_strength
+    test r5d, r5d
+    jz .w16_main ; filter_strength == 0
+    popcnt r5d, r5d ; r5d = filter_strength
+    vpbroadcastd m1, [base+pb_12]
+    vbroadcasti128 m6, [base+z_filter_s+8]
+    vinserti128 m2, m6, [base+z_filter_s], 0
+    vinserti128 m6, [base+z_filter_s+16], 1
+    mova xm10, [tlq-1]
+    vinserti128 m10, [tlq+3], 1
+    vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128 m7, [base+z_filter_s+14]
+    vinserti128 m8, m7, [base+z_filter_s+6], 0
+    vinserti128 m7, [base+z_filter_s+22], 1
+    psubw m0, m1
+    movu xm11, [tlq+12]
+    vinserti128 m11, [tlq+16], 1
+    pminub m8, m0
+    pminub m7, m0
+    pshufb m0, m10, m2
+    shufps m2, m6, q2121
+    pmaddubsw m0, m9
+    pshufb m1, m11, m8
+    shufps m8, m7, q2121
+    pmaddubsw m1, m9
+    vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+    movzx r3d, byte [tlq+31]
+    pshufb m2, m10, m2
+    pmaddubsw m2, m9
+    pshufb m8, m11, m8
+    pmaddubsw m8, m9
+    paddw m0, m2
+    paddw m1, m8
+    sub r5d, 3 ; r5d = filter_strength - 3
+    jnz .w16_3tap
+    vpbroadcastd m9, [z_filter_k+4*8]
+    movzx r2d, byte [tlq+30]
+    pshufb m10, m6
+    pmaddubsw m10, m9
+    pshufb m11, m7
+    pmaddubsw m11, m9
+    sub r2d, r3d
+    lea r2d, [r2+r3*8+4]
+    shr r2d, 3 ; (tl[30] + tl[31]*7 + 4) >> 3, same edge case as in .w8
+    mov [rsp+32], r2b
+    paddw m0, m10
+    paddw m1, m11
+.w16_3tap:
+    pmulhrsw m0, m3
+    pmulhrsw m1, m3
+    sar r5d, 1
+    mov tlq, rsp ; from here on tlq points at the filtered copy stored below
+    add r5d, 33 ; 32, or 33 when filter_strength == 3
+    cmp hd, 32
+    cmovns maxbased, r5d ; only when h >= 32
+    mov [tlq+r5], r3b ; place the unfiltered last edge byte (read from tl+31 above) at index r5
+    packuswb m0, m1
+    vpermq m0, m0, q3120
+    mova [tlq], m0
+.w16_main:
+    movd xm6, dxd
+    vbroadcasti128 m0, [z_base_inc]
+    vpbroadcastb m7, [tlq+maxbaseq]
+    shl maxbased, 6
+    vpbroadcastw m6, xm6
+    movd xm9, maxbased
+    vbroadcasti128 m8, [z_filter_s+2]
+    vpbroadcastw m9, xm9
+    mov r3d, dxd ; xpos
+    psubw m9, m0
+    paddw m11, m6, m6
+    psubw m10, m9, m3 ; 64*8
+    vpblendd m6, m11, 0xf0
+.w16_loop:
+    lea r5d, [r3+dxq]
+    shr r3d, 6 ; base0
+    pand m1, m4, m6
+    psubw m2, m5, m1
+    psllw m1, 8
+    por m2, m1
+    movu xm0, [tlq+r3+0]
+    movu xm1, [tlq+r3+8]
+    lea r3d, [r5+dxq]
+    shr r5d, 6 ; base1
+    vinserti128 m0, [tlq+r5+0], 1
+    vinserti128 m1, [tlq+r5+8], 1
+    pshufb m0, m8
+    pshufb m1, m8
+    pmaddubsw m0, m2
+    pmaddubsw m1, m2
+    pmulhrsw m0, m3
+    pmulhrsw m1, m3
+    packuswb m0, m1
+    pcmpgtw m1, m9, m6
+    pcmpgtw m2, m10, m6
+    packsswb m1, m2 ; word compare results -> byte mask for vpblendvb
+    paddw m6, m11
+    vpblendvb m0, m7, m0, m1 ; mask ? interpolated pixel : broadcast edge byte in m7
+    mova [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    sub hd, 2 ; two rows per iteration
+    jz .w16_end
+    lea dstq, [dstq+strideq*2]
+    cmp r3d, maxbased ; next xpos vs. maxbase << 6
+    jb .w16_loop
+.w16_end_loop:
+    mova [dstq+strideq*0], xm7
+    mova [dstq+strideq*1], xm7
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jg .w16_end_loop
+.w16_end:
+    RET
+ALIGN function_align
+.w32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -96, 15
+    lea r3d, [hq+31]
+    mov maxbased, 63
+    cmp hd, 32
+    cmovs maxbased, r3d ; maxbase = h < 32 ? h+31 : 63
+    test angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w32_main
+    vbroadcasti128 m0, [pb_0to15]
+    sub r3d, 29 ; h+2
+    movu xm13, [tlq+29] ; 32-39
+    movd xm1, r3d
+    movu xm14, [tlq+37] ; 40-47
+    sub r3d, 8 ; h-6
+    vinserti128 m14, [tlq+51], 1 ; 56-63
+    vpbroadcastb xm1, xm1
+    mova xm11, [tlq- 1] ; 0- 7
+    vinserti128 m11, [tlq+13], 1 ; 16-23
+    movd xm2, r3d
+    movu xm12, [tlq+ 5] ; 8-15
+    vinserti128 m12, [tlq+19], 1 ; 24-31
+    pminub xm1, xm0 ; clip 32x8
+    mova m7, [z_filter_s+0]
+    pshufb xm13, xm1
+    vpbroadcastd m1, [pb_12]
+    vpbroadcastb xm2, xm2
+    vinserti128 m13, [tlq+43], 1 ; 48-55
+    vinserti128 m8, m7, [z_filter_s+4], 1
+    vpblendd m2, m1, 0xf0
+    vinserti128 m7, [z_filter_s+12], 0
+    pminub m2, m0 ; clip 32x16 and 32x(32|64)
+    vpbroadcastd m9, [z_filter_k+4*2+12*0]
+    pshufb m14, m2
+    pshufb m0, m11, m8
+    shufps m8, m7, q1021
+    pmaddubsw m0, m9
+    pshufb m2, m12, m8
+    pmaddubsw m2, m9
+    pshufb m1, m13, m8
+    pmaddubsw m1, m9
+    pshufb m6, m14, m8
+    pmaddubsw m6, m9
+    vpbroadcastd m9, [z_filter_k+4*2+12*1]
+    pshufb m10, m11, m8
+    shufps m8, m7, q2121
+    pmaddubsw m10, m9
+    paddw m0, m10
+    pshufb m10, m12, m8
+    pmaddubsw m10, m9
+    paddw m2, m10
+    pshufb m10, m13, m8
+    pmaddubsw m10, m9
+    paddw m1, m10
+    pshufb m10, m14, m8
+    pmaddubsw m10, m9
+    paddw m6, m10
+    vpbroadcastd m9, [z_filter_k+4*2+12*2]
+    pshufb m11, m8
+    pmaddubsw m11, m9
+    pshufb m12, m7
+    pmaddubsw m12, m9
+    movzx r3d, byte [tlq+63]
+    movzx r2d, byte [tlq+62]
+    paddw m0, m11
+    paddw m2, m12
+    pshufb m13, m7
+    pmaddubsw m13, m9
+    pshufb m14, m7
+    pmaddubsw m14, m9
+    paddw m1, m13
+    paddw m6, m14
+    sub r2d, r3d
+    lea r2d, [r2+r3*8+4] ; edge case for 32x64
+    pmulhrsw m0, m3
+    pmulhrsw m2, m3
+    pmulhrsw m1, m3
+    pmulhrsw m6, m3
+    shr r2d, 3
+    mov [rsp+64], r2b
+    mov tlq, rsp ; from here on tlq points at the filtered copy stored below
+    mov [tlq+65], r3b
+    mov r3d, 65
+    cmp hd, 64
+    cmove maxbased, r3d ; maxbase = 65 when h == 64
+    packuswb m0, m2
+    packuswb m1, m6
+    mova [tlq+ 0], m0
+    mova [tlq+32], m1
+.w32_main:
+    movd xm6, dxd
+    vpbroadcastb m7, [tlq+maxbaseq]
+    shl maxbased, 6
+    vpbroadcastw m6, xm6
+    movd xm9, maxbased
+    vbroadcasti128 m8, [z_filter_s+2]
+    vpbroadcastw m9, xm9
+    mov r5d, dxd ; xpos
+    psubw m9, [z_base_inc]
+    mova m11, m6
+    psubw m10, m9, m3 ; 64*8
+.w32_loop:
+    mov r3d, r5d
+    shr r3d, 6 ; base
+    pand m1, m4, m6
+    psubw m2, m5, m1
+    psllw m1, 8
+    por m2, m1
+    movu m0, [tlq+r3+0]
+    movu m1, [tlq+r3+8]
+    add r5d, dxd ; xpos += dx
+    pshufb m0, m8
+    pshufb m1, m8
+    pmaddubsw m0, m2
+    pmaddubsw m1, m2
+    pmulhrsw m0, m3
+    pmulhrsw m1, m3
+    packuswb m0, m1
+    pcmpgtw m1, m9, m6
+    pcmpgtw m2, m10, m6
+    packsswb m1, m2 ; word compare results -> byte mask for vpblendvb
+    paddw m6, m11
+    vpblendvb m0, m7, m0, m1 ; mask ? interpolated pixel : broadcast edge byte in m7
+    mova [dstq], m0
+    dec hd ; one row per iteration
+    jz .w32_end
+    add dstq, strideq
+    cmp r5d, maxbased ; next xpos vs. maxbase << 6
+    jb .w32_loop
+    test hb, 1
+    jz .w32_end_loop ; even number of rows left: go straight to the two-row fill loop
+    mova [dstq], m7
+    add dstq, strideq
+    dec hd
+    jz .w32_end
+.w32_end_loop:
+    mova [dstq+strideq*0], m7
+    mova [dstq+strideq*1], m7
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jg .w32_end_loop
+.w32_end:
+    RET
+ALIGN function_align
+.w64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK -128, 16
+    lea maxbased, [hq+63]
+    test angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w64_main
+    mova xm11, [tlq- 1] ; 0- 7
+    vinserti128 m11, [tlq+13], 1 ; 16-23
+    movu xm12, [tlq+ 5] ; 8-15
+    vinserti128 m12, [tlq+19], 1 ; 24-31
+    mova m7, [z_filter_s+0]
+    vinserti128 m8, m7, [z_filter_s+4], 1
+    vinserti128 m7, [z_filter_s+12], 0
+    vpbroadcastd m9, [z_filter_k+4*2+12*0]
+    movu xm13, [tlq+29] ; 32-39
+    vinserti128 m13, [tlq+43], 1 ; 48-55
+    movu xm14, [tlq+37] ; 40-47
+    vinserti128 m14, [tlq+51], 1 ; 56-63
+    pshufb m0, m11, m8
+    shufps m8, m7, q1021
+    pmaddubsw m0, m9
+    pshufb m2, m12, m8
+    pmaddubsw m2, m9
+    pshufb m1, m13, m8
+    pmaddubsw m1, m9
+    pshufb m6, m14, m8
+    pmaddubsw m6, m9
+    vpbroadcastd m9, [z_filter_k+4*2+12*1]
+    pshufb m10, m11, m8
+    shufps m15, m8, m7, q2121
+    pmaddubsw m10, m9
+    paddw m0, m10
+    pshufb m10, m12, m15
+    pmaddubsw m10, m9
+    paddw m2, m10
+    pshufb m10, m13, m15
+    pmaddubsw m10, m9
+    paddw m1, m10
+    pshufb m10, m14, m15
+    pmaddubsw m10, m9
+    paddw m6, m10
+    vpbroadcastd m10, [z_filter_k+4*2+12*2]
+    pshufb m11, m15
+    pmaddubsw m11, m10
+    pshufb m12, m7
+    pmaddubsw m12, m10
+    pshufb m13, m7
+    pmaddubsw m13, m10
+    pshufb m14, m7
+    pmaddubsw m14, m10
+    paddw m0, m11
+    paddw m2, m12
+    paddw m1, m13
+    paddw m6, m14
+    movu xm11, [tlq+ 61] ; 64- 71
+    vinserti128 m11, [tlq+ 75], 1 ; 80- 87
+    movu xm12, [tlq+ 69] ; 72- 79
+    vinserti128 m12, [tlq+ 83], 1 ; 88- 95
+    movu xm13, [tlq+ 93] ; 96-103
+    vinserti128 m13, [tlq+107], 1 ; 112-119
+    movu xm14, [tlq+101] ; 104-111
+    vinserti128 m14, [tlq+115], 1 ; 120-127
+    pmulhrsw m0, m3
+    pmulhrsw m2, m3
+    pmulhrsw m1, m3
+    pmulhrsw m6, m3
+    lea r3d, [hq-20]
+    mov tlq, rsp ; second half was loaded above, so tlq can switch to the stack copy now
+    packuswb m0, m2
+    packuswb m1, m6
+    vpbroadcastd xm2, [pb_14]
+    vbroadcasti128 m6, [pb_0to15]
+    mova [tlq+32*0], m0
+    mova [tlq+32*1], m1
+    movd xm0, r3d
+    vpbroadcastd m1, [pb_12]
+    vpbroadcastb m0, xm0
+    paddb m0, m2
+    pminub m0, m6 ; clip 64x16 and 64x32
+    pshufb m12, m0
+    pminub m1, m6 ; clip 64x64
+    pshufb m14, m1
+    pshufb m0, m11, m7
+    pmaddubsw m0, m10
+    pshufb m2, m12, m7
+    pmaddubsw m2, m10
+    pshufb m1, m13, m7
+    pmaddubsw m1, m10
+    pshufb m6, m14, m7
+    pmaddubsw m6, m10
+    pshufb m7, m11, m15
+    pmaddubsw m7, m9
+    pshufb m10, m12, m15
+    pmaddubsw m10, m9
+    paddw m0, m7
+    pshufb m7, m13, m15
+    pmaddubsw m7, m9
+    paddw m2, m10
+    pshufb m10, m14, m15
+    pmaddubsw m10, m9
+    paddw m1, m7
+    paddw m6, m10
+    vpbroadcastd m9, [z_filter_k+4*2+12*0]
+    pshufb m11, m8
+    pmaddubsw m11, m9
+    pshufb m12, m8
+    pmaddubsw m12, m9
+    pshufb m13, m8
+    pmaddubsw m13, m9
+    pshufb m14, m8
+    pmaddubsw m14, m9
+    paddw m0, m11
+    paddw m2, m12
+    paddw m1, m13
+    paddw m6, m14
+    pmulhrsw m0, m3
+    pmulhrsw m2, m3
+    pmulhrsw m1, m3
+    pmulhrsw m6, m3
+    packuswb m0, m2
+    packuswb m1, m6
+    mova [tlq+32*2], m0
+    mova [tlq+32*3], m1
+.w64_main:
+    movd xm12, dxd
+    vpbroadcastb m7, [tlq+maxbaseq]
+    lea r3d, [dxq-64]
+    shl maxbased, 6
+    vpbroadcastw m12, xm12
+    sub r3d, maxbased ; r3d = dx - ((maxbase + 1) << 6): biased xpos, frac bits unchanged
+    vbroadcasti128 m8, [z_filter_s+2]
+    movd xm6, r3d
+    mov r5d, dxd ; xpos
+    mova m10, [pb_1to32]
+    vpbroadcastd m11, [pb_32]
+    vpbroadcastw m6, xm6
+.w64_loop:
+    mov r3d, r5d
+    shr r3d, 6 ; base
+    movu m0, [tlq+r3+ 0]
+    movu m1, [tlq+r3+ 8]
+    pand m2, m4, m6
+    psubw m9, m5, m2
+    psllw m2, 8
+    por m9, m2
+    pshufb m0, m8
+    pshufb m1, m8
+    pmaddubsw m0, m9
+    pmaddubsw m1, m9
+    psraw m2, m6, 6 ; base - maxbase - 1
+    pmulhrsw m0, m3
+    pmulhrsw m1, m3
+    packsswb m2, m2
+    paddb m2, m10 ; byte sign bit set where base + x < maxbase (assuming pb_1to32 = 1..32)
+    packuswb m0, m1
+    vpblendvb m0, m7, m0, m2 ; sign bit ? interpolated pixel : broadcast edge byte in m7
+    mova [dstq+ 0], m0
+    movu m0, [tlq+r3+32]
+    movu m1, [tlq+r3+40]
+    add r5d, dxd ; xpos += dx
+    pshufb m0, m8
+    pshufb m1, m8
+    pmaddubsw m0, m9
+    pmaddubsw m1, m9
+    paddb m2, m11 ; advance the mask to columns 32-63 (assuming pb_32 = 32)
+    pmulhrsw m0, m3
+    pmulhrsw m1, m3
+    paddw m6, m12
+    packuswb m0, m1
+    vpblendvb m0, m7, m0, m2
+    mova [dstq+32], m0
+    dec hd ; one row per iteration
+    jz .w64_end
+    add dstq, strideq
+    cmp r5d, maxbased ; next xpos vs. maxbase << 6
+    jb .w64_loop
+.w64_end_loop:
+    mova [dstq+ 0], m7
+    mova [dstq+32], m7
+    add dstq, strideq
+    dec hd
+    jg .w64_end_loop
+.w64_end:
+    RET
+
+cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+    lea r9, [ipred_z2_avx2_table]
+    tzcnt wd, wm
+    movifnidn angled, anglem
+    movifnidn hd, hm
+    lea dxq, [dr_intra_derivative-90]
+    movsxd wq, [r9+wq*4]
+    movzx dyd, angleb
+    xor angled, 0x400
+    mov r8, dxq
+    sub dxq, dyq
+    add wq, r9
+    add r9, z_filter_t0-ipred_z2_avx2_table
+    mova m2, [tlq-64]
+    mova m0, [tlq-32]
+    mova m1, [tlq]
+    and dyd, ~1
+    and dxq, ~1
+    movzx dyd, word [r8+dyq] ; angle - 90
+    movzx dxd, word [dxq+270] ; 180 - angle
+    vpbroadcastd m13, [base+pw_512]
+    vpbroadcastd m14, [base+pw_62]
+    vpbroadcastd m15, [base+pw_64]
+    mova [rsp+ 0], m2
+    mova [rsp+32], m0
+    mova [rsp+64], m1
+    neg dxd
+    neg dyd
+    jmp wq
+.w4:
+ vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 + vbroadcasti128 m10, [base+z1_shuf_w4] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + movd xm5, dyd + mov r8d, (63-4)<<6 + mov dyq, -4 + pshuflw xm5, xm5, q0000 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm3, [base+pb_4] + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_above: ; w4/w8 + pshufb xm2, xm1, [base+z_upsample1-2] + pminub xm3, [base+z_filter_s+4] + vpbroadcastd xm4, [base+pb_36_m4] + vbroadcasti128 m10, [base+pb_0to15] + pshufb xm3, xm1, xm3 + pmaddubsw xm2, xm4 + pmaddubsw xm3, xm4 + lea r2d, [r2+dxq+(1<<6)] + add dxd, dxd + paddw xm2, xm3 + pmulhrsw xm2, xm13 + sub r8d, 3<<6 + paddw m6, m6 + packuswb xm2, xm2 + punpcklbw xm1, xm2 + mova [rsp+gprsize+64], xm1 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mov r3d, hd + and r3d, 4 + movd xm2, [rsp+gprsize+64] + movddup xm0, [rsp+gprsize+56] + movd xm1, r3d + palignr xm2, xm0, 1 + vpbroadcastb xm1, xm1 + pshufb xm2, [base+z_filter_s+18] + vpbroadcastd xm3, [base+pb_36_m4] + pmaxub xm1, [base+z_upsample1-2] + pshufb xm1, xm0, xm1 + pmaddubsw xm2, xm3 + pmaddubsw xm1, xm3 + paddw xm5, xm5 + add dyq, dyq + paddw xm1, xm2 + pmulhrsw xm1, xm13 + vbroadcasti128 m11, [base+z2_upsample] + paddw xm5, xm15 + packuswb xm1, xm1 + punpcklbw xm0, xm1 + mova [rsp+gprsize+48], xm0 + ret +.w4_no_upsample_above: + lea r3d, 
[hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm2, [base+pb_4] + pminub xm2, [base+z_filter_s] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm3, xm1, xm2 ; 00 01 12 23 + pshufd xm2, xm2, q0321 + pmaddubsw xm0, xm3, xm0 + pshufb xm2, xm1, xm2 ; 12 23 34 44 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + punpckhqdq xm3, xm3 ; 34 44 44 44 + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrlq xm1, 8 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movd [rsp+65], xm0 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mov r5d, 10 + cmp hd, 16 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + vpbroadcastb m0, xm0 + pmaxub m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pshufb m0, m2, m0 + pmaddubsw m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] + pshufb m1, m2, m1 + pmaddubsw m1, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m4 + pmaddubsw m2, m3 + movd xm4, r7m ; max_height + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + psubb xm4, [base+pb_16to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + vextracti128 xm0, m1, 1 + packuswb xm0, xm1 + vpblendvb xm0, [rsp+48], xm4 + mova [rsp+48], xm0 + jmp .w4_main +.w4_upsample_left: + call .upsample_left +.w4_main: + movd xm0, dxd + mova m12, [base+z2_y_shuf_h4] + lea r5, [rsp+56] ; left-7 + vpbroadcastw m0, xm0 + lea r9, [strideq*3] + psraw xm1, xm5, 6 + pand xm5, xm14 ; frac_y + pxor xm2, xm2 + paddw m7, m0, m0 + psubw xm4, xm2, xm1 ; base_y + vpblendd m0, m7, 0xcc + mova xm1, xm7 + punpcklwd xm4, xm2 + paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 + psubw xm1, xm15, xm5 ; 64-frac_y + psllw xm5, 8 + paddw m7, m7 + paddw m6, m0 + por xm5, xm1 ; 64-frac_y, frac_y + vpbroadcastq m5, xm5 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vpbroadcastq m2, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps xm0, [rsp+r3] + vpblendd m1, m2, 0xc0 + pand m2, m14, m6 ; frac_x + vpblendd m0, m1, 0xf0 + psubw m1, m15, m2 ; 64-frac_x + psllw m2, 8 + pshufb m0, m10 + por m1, m2 ; 64-frac_x, frac_x + pmaddubsw m0, m1 + cmp r3d, 64 + jge .w4_toponly + mova m1, m7 ; arbitrary negative 
value + vpgatherdq m3, [r5+xm4], m1 + pshufb m1, m3, m11 + vpermd m1, m12, m1 + pmaddubsw m1, m5 + psraw m2, m6, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w4_toponly: + pmulhrsw m0, m13 + paddw m6, m7 ; xpos += dx + add r5, dyq + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w4_loop +.w4_leftonly_loop: + mova m1, m7 + vpgatherdq m2, [r5+xm4], m1 + add r5, dyq + pshufb m0, m2, m11 + vpermd m0, m12, m0 + pmaddubsw m0, m5 + pmulhrsw m0, m13 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 + movd xm5, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + vpbroadcastw xm5, xm5 + mov r8d, (63-8)<<6 + mov dyq, -4 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm3, [base+pb_8] + movhps [rsp+80], xm1 + call .upsample_above + sub angled, 53 ; angle - 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm3, [base+pb_8] + pminub xm3, [base+z_filter_s+8] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 + pmaddubsw xm0, xm2, xm0 + pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 + shufps xm2, xm3, q2121 
; 12 23 34 45 56 67 78 88 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminuw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrldq xm1, 1 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movq [rsp+65], xm0 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] + cmp hd, 32 + jne .w8_filter_left_h16 + movu xm2, [rsp+27] + vinserti128 m2, [rsp+35], 1 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m3, [base+z_filter_s+ 8] + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + pmaxub m3, m0 + pshufb m3, m2, m3 + pmaddubsw m3, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w8_filter_left_top16 +.w8_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w8_filter_left_top16: + vbroadcasti128 m1, [base+z_filter_s+12] + vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vbroadcasti128 m4, [base+z_filter_s+16] + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m2 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + movd xm7, r7m ; max_height + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + pminsw xm7, xm15 + paddw m1, m0 + vpbroadcastb m7, xm7 + paddw m1, m2 + pmulhrsw m1, m13 + psubb m7, [base+pb_32to1] + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [rsp+32], m7 + mova [rsp+32], m3 + jmp .w8_main +.w8_upsample_left: + call .upsample_left +.w8_main: + movd xm3, dxd + lea r5, [rsp+56] ; left-7 + pshufd xm1, xm5, q3120 + pand xm5, xm14 + vpbroadcastw m3, xm3 + pxor xm0, xm0 + psubw xm2, xm15, xm5 + psraw xm1, 6 + lea r9, [strideq*3] + paddw m7, m3, m3 + psubw xm9, xm0, xm1 ; base_y + psllw xm5, 8 + punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 + vpblendd m3, m7, 0xf0 ; xpos0 xpos1 + por xm5, xm2 ; 64-frac_y, frac_y + punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 + paddw m6, m3 + vinserti128 m12, m5, xm5, 1 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3], 1 + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movu xm1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m1, [rsp+r3], 1 + pand m2, m14, m6 + paddsw m4, m6, m7 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m5 + pmaddubsw m0, m2 + pand m2, m14, m4 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m5 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w8_toponly + mova m5, m7 + vpgatherdq m3, [r5+xm9], m7 + mova m7, m5 + vpgatherdq m2, [r5+xm8], m5 + pshufb 
m3, m11 + pshufb m2, m11 + punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 + vpermq m5, m5, q3120 ; y0 y1 + vpermq m2, m2, q3120 ; y2 y3 + pmaddubsw m5, m12 + pmaddubsw m2, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m5, m6 + psraw m3, m4, 15 + vpblendvb m1, m2, m3 +.w8_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m4, m7 ; xpos += dx + add r5, dyq + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + sub hd, 4 + jz .w8_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w8_loop +.w8_leftonly_loop: + mova m0, m7 + vpgatherdq m5, [r5+xm9], m7 + mova m7, m0 + vpgatherdq m3, [r5+xm8], m0 + add r5, dyq + pshufb m2, m5, m11 + pshufb m1, m3, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.w16: + mov r8d, hd + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vbroadcasti128 m6, [tlq+1] + mova xm2, [base+z_filter_s] + vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de + movu xm3, [base+z_filter_s+8] + vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff + vpblendd m1, m6, 0xf0 + vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m1, m2 + pshufb m1, m3 + pmaddubsw m0, m2, m0 + shufps m2, m1, q2121 ; 12 23 34 45 56 
67 78 89 89 9a ab bc cd de ef ff + pmaddubsw m2, m4 + pmaddubsw m1, m5 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw m0, m2 + paddw m0, m1 + pmulhrsw m0, m13 + psubb xm4, [base+pb_1to32] + vextracti128 xm2, m0, 1 + packuswb xm0, xm2 + vpblendvb xm0, xm6, xm4 + movu [rsp+65], xm0 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] +.w16_filter_left: + movd xm6, r7m ; max_height + pminsw xm6, xm15 + vpbroadcastb m6, xm6 + cmp hd, 32 + jl .w16_filter_left_h16 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m10, [base+z_filter_s+ 8] + vbroadcasti128 m11, [base+z_filter_s+12] + vbroadcasti128 m12, [base+z_filter_s+16] + je .w16_filter_left_h32 + movu m3, [tlq-69] + movu m5, [tlq-61] + pmaxub m1, m10, m0 + pshufb m1, m3, m1 + pmaddubsw m1, m7 + pshufb m2, m3, m11 + pmaddubsw m2, m8 + pshufb m3, m12 + pmaddubsw m3, m9 + paddw m1, m2 + pshufb m2, m5, m10 + pmaddubsw m2, m7 + pshufb m4, m5, m11 + pmaddubsw m4, m8 + pshufb m5, m12 + pmaddubsw m5, m9 + paddw m1, m3 + vpbroadcastd m3, [base+pb_32] + paddb m3, [base+pb_32to1] + paddw m2, m4 + paddw m2, m5 + pmulhrsw m1, m13 + pmulhrsw m2, m13 + psubb m3, m6, m3 + packuswb m1, m2 + vpblendvb m1, [tlq-64], m3 + mova [rsp], m1 + jmp .w16_filter_left_top32 +.w16_filter_left_h32: + pmaxub m10, m0 +.w16_filter_left_top32: + movu xm2, [tlq-37] + vinserti128 m2, [tlq-29], 1 + pshufb m3, m2, m10 + pshufb m1, m2, m11 + pshufb m2, m12 + pmaddubsw m3, m7 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w16_filter_left_top16 +.w16_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w16_filter_left_top16: + movu xm2, [tlq-15] + vinserti128 m2, [tlq-21], 1 + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m5 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + psubb m6, [base+pb_32to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [tlq-32], m6 + mova [rsp+32], m3 +.w16_main: + movd xm1, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + movd xm7, dxd + vbroadcasti128 m11, [base+z2_shuf_h2] + vpbroadcastw m1, xm1 + vpbroadcastw m7, xm7 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul] + psllw xm1, 4 + paddw m6, m7, [base+z2_base_inc] + lea r9d, [dxq+(65<<6)] ; xpos + movd [rsp+156], xm1 +.w16_loop0: + mov r2d, r9d + mova [rsp+160], m0 + lea r5, [rsp+60] ; left-3 + mova [rsp+192], m6 + pxor m1, m1 + psraw m2, m0, 6 + pand m0, m14 + psubw m9, m1, m2 ; base_y + psubw m12, m15, m0 + punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 + psllw m0, 8 + punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 + por m12, m0 ; 64-frac_y, frac_y +.w16_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + vinserti128 m0, [rsp+r2+8], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm1, [rsp+r3] + vinserti128 m1, [rsp+r3+8], 1 + pand m2, m14, m6 + paddsw m5, m6, m7 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m14, m5 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w16_toponly + punpckhwd m2, m5, m5 ; mask out unnecessary loads + vpgatherdd m4, [r5+m9], m2 + punpcklwd 
m2, m5, m5 + vpgatherdd m3, [r5+m8], m2 + pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 + pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 + punpcklqdq m2, m3, m4 ; y0 + punpckhqdq m3, m4 ; y1 + pmaddubsw m2, m12 + pmaddubsw m3, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m2, m6 + psraw m6, m5, 15 + vpblendvb m1, m3, m6 +.w16_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m5, m7 ; xpos += dx + sub r5, 2 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-16)<<6 + jge .w16_loop +.w16_leftonly_loop: + mova m0, m7 + vpgatherdd m4, [r5+m9], m7 + mova m7, m0 + vpgatherdd m3, [r5+m8], m0 + sub r5, 2 + pshufb m2, m4, m11 + pshufb m1, m3, m11 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_leftonly_loop +.w16_end: + sub r8d, 1<<8 + jl .w16_ret + vpbroadcastd m0, [rsp+156] + paddw m0, [rsp+160] ; base_y += 16*dy + paddw m6, m13, [rsp+192] + add r7, 16 + add r9d, 16<<6 + movzx hd, r8b + mov dstq, r7 + paddw m6, m13 ; base_x += 16*64 + jmp .w16_loop0 +.w16_ret: + RET +.w32: + mova m2, [tlq+32] + lea r8d, [hq+(1<<8)] + mova [rsp+96], m2 + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc + vinserti128 m1, [tlq+11], 1 + movu xm6, [base+z_filter_s+12] + vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+17], 1 + movd xm0, r6m ; max_width + pminsw 
xm0, xm15 + vpbroadcastb m10, xm0 +.w32_filter_above: + pshufb m0, m1, m5 + shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m1, m4 + shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m1, m5 + pmaddubsw m1, m9 + paddw m0, m2 + paddw m0, m1 + pshufb m1, m3, m4 + pmaddubsw m1, m7 + pshufb m2, m3, m5 + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + psubb m10, [base+pb_1to32] + packuswb m0, m1 + vpblendvb m0, [tlq+1], m10 + movu [rsp+65], m0 + jmp .w16_filter_left +.w64: + mova m2, [tlq+32] + mov r3d, [tlq+64] + lea r8d, [hq+(3<<8)] + mova [rsp+ 96], m2 + mov [rsp+128], r3d + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + movu xm6, [base+z_filter_s+ 4] + vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc + movu xm3, [tlq+30] + vinserti128 m3, [tlq+43], 1 + movu xm5, [base+z_filter_s+16] + vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff + pshufb m0, m3, m6 + shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m3, m4 + shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m0, m2 + paddw m0, m3 + movu xm2, [tlq+36] + vinserti128 m2, [tlq+49], 1 + pshufb m4, m2, m4 + pmaddubsw m4, m7 + pshufb m3, m2, m6 + pmaddubsw m3, m8 + pshufb m2, m5 + pmaddubsw m2, m9 + movd xm5, r6m ; max_width + pminsw xm5, xm15 + vpbroadcastb m10, xm5 + paddw m3, m4 + paddw m2, m3 + vpbroadcastd m3, [base+pb_32] + pmulhrsw m0, m13 + pmulhrsw m2, m13 + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+6], 1 + psubb m3, m10, m3 + psubb m3, [base+pb_1to32] + vinserti128 m1, 
[tlq+13], 1 + packuswb m0, m2 + vpblendvb m0, [tlq+33], m3 + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+19], 1 + movu [rsp+97], m0 + jmp .w32_filter_above + +cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + dec tlq + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m3, [pw_512] + vpbroadcastd m4, [pw_62] + vpbroadcastd m5, [pw_64] + mov org_wd, wd + jmp hq +.h4: + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + ALLOC_STACK -32, 9 + movu xm8, [tlq-7] + pshufb xm0, xm8, [z_upsample1-4] + vpbroadcastb xm2, xm8 + pshufb xm1, xm8, [z_filter_s+2] + mova [rsp+16], xm2 ; top[max_base_y] + vpbroadcastd xm2, [pb_36_m4] + add dyd, dyd + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + movd xm7, dyd + mov r2d, dyd + vpbroadcastw m7, xm7 + paddw xm1, xm0 + pmulhrsw xm1, xm3 + pslldq m6, m7, 8 + paddw xm2, xm7, xm7 + paddw m6, m7 + packuswb xm1, xm1 + paddw m6, m2 + punpcklbw xm1, xm8 + mova xm8, [z_transpose4] + psllw m7, 2 + pshufb xm1, [pb_15to0] + mova [rsp], xm1 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + vpbroadcastq m2, [rsp+r4] + lea r4d, [r2+dyq] + shr r2d, 6 + movq xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + movhps xm0, [rsp+r4] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 + psllw m2, 8 + por m1, m2 + pmaddubsw m0, m1 + paddw m6, m7 + pmulhrsw m0, m3 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm8 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 
+ add dstq, 4 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm2, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m2, xm2 + pcmpeqb m1, m0, [base+z_filter_wh] + pand m1, m2 + mova xm2, [r4+angleq*8] + pcmpgtb m1, m2 + pmovmskb r5d, m1 + ret +.h4_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -16, 12 + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m7, [base+pb_7] + vbroadcasti128 m2, [tlq-14] + pmaxub m1, m7, [base+z_filter_s-4] + vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] + pmaxub m7, [base+z_filter_s+4] + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] + pshufb m0, m2, m1 + shufps m1, m7, q2121 + pmaddubsw m0, m8 + pshufb m1, m2, m1 + pmaddubsw m1, m9 + pshufb m2, m7 + pmaddubsw m2, m10 + paddw m0, m1 + paddw m0, m2 + pmulhrsw m0, m3 + mov r4d, 9 + lea tlq, [rsp+15] + cmp wd, 4 + cmovne maxbased, r4d + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [rsp], xm0 +.h4_main: + movd xm6, dyd + vpbroadcastq m0, [z_base_inc] ; base_inc << 6 + mov r4, tlq + sub tlq, 4 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] ; ypos + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf_w4] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 ; top[max_base_y] + paddw m10, m6, m6 + psubw m9, m0 ; max_base_y + vpblendd m6, m10, 0xcc + mova xm0, xm10 + paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 + paddw m10, m10 + mova xm11, [z_transpose4] +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + vpbroadcastq m1, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vpbroadcastq m2, [tlq+r5] + lea r5, [r4+dyq] + 
sar r4, 6 ; base2 + movq xm0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + movhps xm0, [tlq+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + pcmpgtw m1, m9, m6 ; base < max_base_y + pmulhrsw m0, m3 + paddw m6, m10 ; ypos += dy + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm11 ; transpose + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 + sub wd, 4 + jz .h4_end + add dstq, 4 + cmp r4d, maxbased + jg .h4_loop + packuswb xm7, xm7 +.h4_end_loop: + movd [dstq+strideq*0], xm7 + movd [dstq+strideq*1], xm7 + movd [dstq+strideq*2], xm7 + movd [dstq+r7 ], xm7 + add dstq, 4 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +ALIGN function_align +.h8: + lea r4d, [angleq+216] + mov r4b, wb + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 8 + and r4d, 4 + mova xm0, [tlq-15] + vinserti128 m0, [tlq- 9], 1 + movd xm1, r4d + movu xm2, [z_filter_s+2] + vinserti128 m2, [z_filter_s+6], 1 + vpbroadcastb xm1, xm1 ; w & 4 + vpbroadcastd m7, [pb_36_m4] + pmaxub xm1, [z_upsample1-4] ; clip 4x8 + vinserti128 m1, [z_upsample1], 1 + add dyd, dyd + pshufb m1, m0, m1 + pshufb m2, m0, m2 + vinserti128 m0, [tlq-7], 1 + movd xm6, dyd + pmaddubsw m1, m7 + pmaddubsw m2, m7 + vpbroadcastw m6, xm6 + mov r2d, dyd + lea r5, [strideq*3] + paddw m7, m6, m6 + paddw m1, m2 + vpblendd m6, m7, 0xf0 + pmulhrsw m1, m3 + pslldq m2, m7, 8 + paddw m7, m7 + paddw m6, m2 + vbroadcasti128 m2, [pb_15to0] + packuswb m1, m1 + punpcklbw m1, m0 + pshufb m1, m2 + vextracti128 [rsp+ 0], m1, 1 + mova [rsp+16], xm1 +.h8_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 ; base0 + movu xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base1 + vinserti128 m0, [rsp+r4], 1 + lea r4d, 
[r2+dyq] + shr r2d, 6 ; base2 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + punpcklqdq m1, m2, m2 ; frac0 frac1 + pmaddubsw m0, m1 + movu xm1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base3 + vinserti128 m1, [rsp+r4], 1 + punpckhqdq m2, m2 ; frac2 frac3 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + paddw m6, m7 + pmulhrsw m1, m3 + lea r4, [dstq+strideq*4] + psllw m1, 8 + por m0, m1 + vextracti128 xm1, m0, 1 + punpcklbw xm2, xm0, xm1 + punpckhbw xm0, xm1 + movd [dstq+strideq*0], xm2 + pextrd [dstq+strideq*1], xm2, 1 + pextrd [dstq+strideq*2], xm2, 2 + pextrd [dstq+r5 ], xm2, 3 + movd [r4 +strideq*0], xm0 + pextrd [r4 +strideq*1], xm0, 1 + pextrd [r4 +strideq*2], xm0, 2 + pextrd [r4 +r5 ], xm0, 3 + add dstq, 4 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 10 + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd xm6, [base+pb_15] + pcmpeqb xm1, xm1 + psubusb xm6, xm0 + psubb xm6, xm1 ; w == 4 ? 
5 : 1 + movu xm2, [tlq-16] + pmaxub xm1, xm6, [base+z_filter_s] + vinserti128 m2, [tlq-14], 1 + vinserti128 m1, [base+z_filter_s+12], 1 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmaxub xm6, [base+z_filter_s+ 8] + vinserti128 m6, [base+z_filter_s+20], 1 + pshufb m0, m2, m1 + pmaddubsw m0, m7 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-15] + shufps m1, m6, q2121 + pshufb m1, m2, m1 + pmaddubsw m1, m7 + paddw m0, m1 + sub r5d, 3 + jnz .h8_3tap + vpbroadcastd m7, [z_filter_k+4*8] + movzx r2d, byte [tlq-14] + pshufb m2, m6 + pmaddubsw m2, m7 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+15], r2b + paddw m0, m2 +.h8_3tap: + pmulhrsw m0, m3 + sar r5d, 1 + lea tlq, [rsp+31] + add r5d, 17 + cmp wd, 16 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [tlq-15], xm0 +.h8_main: + movd xm2, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m2, xm2 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 + psubw m9, m0 + paddw m6, m2, m2 + vpblendd m2, m6, 0x0f +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m0, m4, m2 + psubw m1, m5, m0 + psllw m0, 8 + por m1, m0 + vbroadcasti128 m0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5], 0 + sub rsp, 8*2 + pshufb m0, m8 + pmaddubsw m0, m1 + pcmpgtw m1, m9, m2 + paddw m2, m6 + pmulhrsw m0, m3 + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + psllw xm0, 8 + por xm0, xm1 ; interleave rows (partial transpose) + mova [rsp], xm0 + sub wd, 2 + jz .h8_transpose + cmp r4d, maxbased + jg .h8_loop + packuswb xm0, xm7, xm7 +.h8_end_loop: + sub rsp, 8*2 + mova [rsp], xm0 + sub wd, 2 + jg .h8_end_loop +.h8_transpose: + mova xm2, [rsp+16*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklwd xm1, 
xm2, xm0 + punpckhwd xm2, xm0 + lea r6, [dstq+strideq*4] + jge .h8_w8 + add rsp, 16*2 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r2 ], xm1, 3 + movd [r6 +strideq*0], xm2 + pextrd [r6 +strideq*1], xm2, 1 + pextrd [r6 +strideq*2], xm2, 2 + pextrd [r6 +r2 ], xm2, 3 + jmp .h8_end +.h8_w8_loop: + mova xm0, [rsp+16*0] + mova xm2, [rsp+16*1] + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 +.h8_w8: ; w8/w16/w32 + mova xm0, [rsp+16*2] + mova xm4, [rsp+16*3] + add rsp, 16*4 + punpcklwd xm3, xm4, xm0 + punpckhwd xm4, xm0 + punpckldq xm0, xm3, xm1 + punpckhdq xm3, xm1 + punpckldq xm1, xm4, xm2 + punpckhdq xm4, xm2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm3 + movhps [dstq+r2 ], xm3 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + movq [r6 +strideq*2], xm4 + movhps [r6 +r2 ], xm4 + sub dstq, 8 + sub r6, 8 + sub org_wd, 8 + jge .h8_w8_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 12 + lea maxbased, [wq+15] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m11, [base+pb_27] + vpbroadcastd m1, [base+pb_1] + vbroadcasti128 m6, [base+z_filter_s+12] + vinserti128 m2, m6, [base+z_filter_s+4], 0 + vinserti128 m6, [base+z_filter_s+20], 1 + movu xm10, [tlq-18] + vinserti128 m10, [tlq-14], 1 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] + vbroadcasti128 m7, [base+z_filter_s+8] + vinserti128 m8, m7, [base+z_filter_s+0], 0 + vinserti128 m7, [base+z_filter_s+16], 1 + psubusb m11, m0 + por m1, m11 + movu xm11, [tlq-32] + vinserti128 m11, [tlq-28], 1 + pmaxub m8, m1 + pmaxub m7, m1 + pshufb m0, m10, m2 + shufps m2, m6, q2121 + pmaddubsw m0, m9 + pshufb m1, m11, m8 + shufps m8, m7, q2121 + pmaddubsw 
m1, m9 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-31] + pshufb m2, m10, m2 + pmaddubsw m2, m9 + pshufb m8, m11, m8 + pmaddubsw m8, m9 + paddw m0, m2 + paddw m1, m8 + sub r5d, 3 + jnz .h16_3tap + vpbroadcastd m9, [z_filter_k+4*8] + movzx r2d, byte [tlq-30] + pshufb m10, m6 + pmaddubsw m10, m9 + pshufb m11, m7 + pmaddubsw m11, m9 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+31], r2b + paddw m0, m10 + paddw m1, m11 +.h16_3tap: + pmulhrsw m0, m3 + pmulhrsw m1, m3 + sar r5d, 1 + lea tlq, [rsp+63] + add r5d, 33 + cmp wd, 32 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + packuswb m0, m1 + vpermq m0, m0, q2031 + mova [tlq-31], m0 +.h16_main: + movd xm6, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, m0 + paddw m11, m6, m6 + psubw m10, m9, m3 ; 64*8 + vpblendd m6, m11, 0xf0 +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r4-0] + movu xm1, [tlq+r4-8] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5-0], 1 + vinserti128 m1, [tlq+r5-8], 1 + sub rsp, 32 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + vpermq m0, m0, q3120 + mova [rsp], m0 + sub wd, 2 + jz .h16_transpose + cmp r4d, maxbased + jg .h16_loop + mova m0, m7 +.h16_end_loop: + sub rsp, 32 + mova [rsp], m7 + sub wd, 2 + jg .h16_end_loop +.h16_transpose: + mova m2, [rsp+32*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + lea r3, [strideq*5] + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + 
lea r4, [strideq+r2*2] ; stride*7 + jge .h16_w8 + add rsp, 32*2 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + vextracti128 xm0, m0, 1 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + lea dstq, [dstq+strideq*8] + vextracti128 xm1, m1, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + jmp .h16_end +.h16_w8_loop: + mova m0, [rsp+32*0] + mova m2, [rsp+32*1] + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 +.h16_w8: + mova m2, [rsp+32*2] + mova m4, [rsp+32*3] + lea r6, [dstq+strideq*8] + add rsp, 32*4 + punpcklbw m3, m4, m2 + punpckhbw m4, m2 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + punpckldq m4, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + vextracti128 xm4, m4, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+r2 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*4], xm0 + movhps [dstq+r3 ], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+r2*2 ], xm3 + movhps [dstq+r4 ], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*0], xm4 + movhps [r6+strideq*1], xm4 + movq [r6+strideq*2], xm2 + movhps [r6+r2 ], xm2 + movq [r6+strideq*4], xm0 + movhps [r6+r3 ], xm0 + movq [r6+r2*2 ], xm3 + movhps [r6+r4 ], xm3 + sub dstq, 8 + sub org_wd, 8 + jge .h16_w8_loop +.h16_end: + RET +ALIGN function_align +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 15 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + vbroadcasti128 m0, [pb_0to15] + mov r4d, 21 + mov r5d, 3 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 
+ sub r4d, wd ; 21-w + cmovns r5d, r4d + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + sub r4d, 8 ; 13-w + movd xm1, r5d + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movd xm2, r4d + vpbroadcastb m1, xm1 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + vpbroadcastb m2, xm2 + pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 + movu m7, [z_filter_s+4] + pshufb m11, m1 + vinserti128 m8, m7, [z_filter_s+8], 1 + vinserti128 m7, [z_filter_s+16], 0 + pmaxsb m2, m0 ; clip 8x32 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m12, m2 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + shufps m8, m7, q1021 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m8 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m8 + pmaddubsw m10, m9 + shufps m8, m7, q2121 + paddw m1, m10 + pshufb m10, m14, m8 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + movzx r4d, byte [tlq-63] + movzx r2d, byte [tlq-62] + paddw m0, m11 + paddw m2, m12 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m7 + pmaddubsw m14, m9 + paddw m1, m13 + paddw m6, m14 + sub r2d, r4d + lea r2d, [r2+r4*8+4] ; edge case for 64x32 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + shr r2d, 3 + mov [rsp+31], r2b + lea tlq, [rsp+95] + mov [tlq-65], r4b + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h32_main: + movd xm6, dyd + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, 
[z_base_inc] + mova m11, m6 + psubw m10, m9, m3 ; 64*8 +.h32_loop: + mov r5, r4 + sar r5, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r5- 0] + vinserti128 m0, [tlq+r5-16], 1 + movu xm1, [tlq+r5- 8] + vinserti128 m1, [tlq+r5-24], 1 + sub rsp, 32 + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [rsp], m0 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 32 + mova [rsp], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + lea dstq, [dstq+org_wq-8] + lea r2, [strideq*3] + lea r3, [strideq*5] + lea r4, [strideq+r2*2] ; stride*7 +.h32_w8_loop: + mova m7, [rsp+32*0] + mova m6, [rsp+32*1] + mova m5, [rsp+32*2] + mova m4, [rsp+32*3] + mova m3, [rsp+32*4] + mova m2, [rsp+32*5] + mova m1, [rsp+32*6] + mova m0, [rsp+32*7] + lea r6, [dstq+strideq*8] + add rsp, 32*8 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + vextracti128 xm6, m6, 1 + movq [dstq+strideq*2], xm7 + movhps [dstq+r2 ], xm7 + vextracti128 xm7, m7, 1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r3 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+r2*2 ], xm8 + movhps [dstq+r4 ], xm8 + vextracti128 xm8, m8, 1 + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 
+ vextracti128 xm1, m1, 1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + vextracti128 xm5, m5, 1 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + lea r6, [r6+strideq*8] + vextracti128 xm0, m0, 1 + movq [r6+strideq*0], xm6 + movhps [r6+strideq*1], xm6 + movq [r6+strideq*2], xm7 + movhps [r6+r2 ], xm7 + movq [r6+strideq*4], xm2 + movhps [r6+r3 ], xm2 + movq [r6+r2*2 ], xm8 + movhps [r6+r4 ], xm8 + lea r6, [r6+strideq*8] + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + sub dstq, 8 + sub org_wd, 8 + jg .h32_w8_loop + RET +ALIGN function_align +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -128, 16 + lea maxbased, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + mov r4d, 21 + vpbroadcastb xm11, [tlq-127] + vpblendd xm11, [tlq-130], 0x0e ; 120-127 + sub r4d, wd ; 21-w + mov r5d, 3 + vinserti128 m11, [tlq-116], 1 ; 104-111 + movu m7, [z_filter_s+4] + cmp wd, 32 + cmove r4d, r5d + vinserti128 m8, m7, [z_filter_s+8], 1 + vbroadcasti128 m6, [pb_0to15] + movd xm1, r4d + vpbroadcastd m9, [z_filter_k+4*2+12*0] + movu xm12, [tlq-122] ; 112-119 + vinserti128 m12, [tlq-108], 1 ; 96-103 + vpbroadcastb m1, xm1 + movu xm13, [tlq- 98] ; 88- 95 + vinserti128 m13, [tlq- 84], 1 ; 72- 79 + movu xm14, [tlq- 90] ; 80- 87 + vinserti128 m14, [tlq- 76], 1 ; 64- 71 + vinserti128 m7, [z_filter_s+16], 0 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pmaxsb m1, m6 ; clip (16|32)x64 + pshufb m13, m1 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + shufps m15, m8, m7, q1021 + pshufb m10, m11, m15 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m15 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw 
m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + shufps m10, m8, m7, q2132 + pshufb m11, m10 + pmaddubsw m11, m9 + pshufb m12, m10 + pmaddubsw m12, m9 + pshufb m13, m10 + pmaddubsw m13, m9 + pshufb m14, m10 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + lea tlq, [rsp+127] + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-127], m0 + mova [tlq- 95], m1 + pshufb m0, m11, m10 + pmaddubsw m0, m9 + pshufb m2, m12, m10 + pmaddubsw m2, m9 + pshufb m1, m13, m10 + pmaddubsw m1, m9 + pshufb m6, m14, m7 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m7, m11, m15 + pmaddubsw m7, m9 + paddw m0, m7 + pshufb m7, m12, m15 + pmaddubsw m7, m9 + paddw m2, m7 + pshufb m7, m13, m15 + pmaddubsw m7, m9 + paddw m1, m7 + pshufb m7, m14, m10 + pmaddubsw m7, m9 + paddw m6, m7 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m15 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h64_main: + movd xm12, dyd + neg maxbaseq + vbroadcasti128 m8, [z3_shuf] + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m12, xm12 + lea r5d, [dyq+maxbaseq-64] + neg dyq + or maxbased, 63 + lea r4, [dyq+63] + movd xm6, r5d + mova xm10, [pb_1to32+16] + vinserti128 m10, [pb_1to32], 1 + vpbroadcastd m11, [pb_32] + vpbroadcastw m6, xm6 +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m0, [tlq+r5-24] + movu m1, 
[tlq+r5-32] + pand m2, m4, m6 + psubw m9, m5, m2 + psllw m2, 8 + por m9, m2 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + psraw m2, m6, 6 + sub rsp, 64 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packsswb m2, m2 + paddb m2, m10 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp+32], m0 + movu m0, [tlq+r5-56] + movu m1, [tlq+r5-64] + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + paddb m2, m11 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m6, m12 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 64 + mova [rsp+32], m7 + mova [rsp+ 0], m7 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + imul r5, strideq, -8 + lea dstq, [dstq+org_wq-16] + lea r4, [strideq+r2*2] ; stride*7 +.h64_transpose_loop0: + lea r6, [rsp+16*3] +.h64_transpose_loop: + mova xm0, [r6+64*15] + vinserti128 m0, [r6+64* 7], 1 + mova xm1, [r6+64*14] + vinserti128 m1, [r6+64* 6], 1 + mova xm2, [r6+64*13] + vinserti128 m2, [r6+64* 5], 1 + mova xm3, [r6+64*12] + vinserti128 m3, [r6+64* 4], 1 + mova xm4, [r6+64*11] + vinserti128 m4, [r6+64* 3], 1 + mova xm5, [r6+64*10] + vinserti128 m5, [r6+64* 2], 1 + mova xm6, [r6+64* 9] + vinserti128 m6, [r6+64* 1], 1 + mova xm7, [r6+64* 8] + vinserti128 m7, [r6+64* 0], 1 + sub r6, 16 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + vpermq m6, m6, q3120 + vpermq m7, m7, q3120 + vpermq m2, m2, q3120 + vpermq m8, m8, q3120 
+ vpermq m3, m3, q3120 + vpermq m1, m1, q3120 + vpermq m5, m5, q3120 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm6 + vextracti128 [dstq+strideq*1], m6, 1 + mova [dstq+strideq*2], xm7 + vextracti128 [dstq+r2 ], m7, 1 + mova [dstq+strideq*4], xm2 + vextracti128 [dstq+r3 ], m2, 1 + mova [dstq+r2*2 ], xm8 + vextracti128 [dstq+r4 ], m8, 1 + sub dstq, r5 + mova [dstq+strideq*0], xm3 + vextracti128 [dstq+strideq*1], m3, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + mova [dstq+strideq*4], xm5 + vextracti128 [dstq+r3 ], m5, 1 + mova [dstq+r2*2 ], xm0 + vextracti128 [dstq+r4 ], m0, 1 + sub dstq, r5 + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 64*16 + lea dstq, [dstq+r5*8-16] + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + +%macro FILTER_XMM 4 ; dst, src, tmp, shuf +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + pshufd xm%1, xm%2, q0000 ; p0 p1 + pmaddubsw xm%1, xm2 + pshufd xm%3, xm%2, q1111 ; p2 p3 + pmaddubsw xm%3, xm3 + paddw xm%1, xm1 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q2222 ; p4 p5 + pmaddubsw xm%3, xm4 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q3333 ; p6 __ + pmaddubsw xm%3, xm5 + paddw xm%1, xm%3 + psraw xm%1, 4 + packuswb xm%1, xm%1 +%endmacro + +%macro FILTER_YMM 4 ; dst, src, tmp, shuf + pshufb m%2, m%4 + pshufd m%1, m%2, q0000 + pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 + pmaddubsw m%3, m3 + paddw m%1, m1 + paddw m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + vpermq m%3, m%1, q1032 + packuswb m%1, m%3 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter +%define base r6-ipred_filter_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_avx2_table] + movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pw_8] + vbroadcasti128 m2, [filterq+16*0] + vbroadcasti128 m3, [filterq+16*1] + vbroadcasti128 m4, [filterq+16*2] + vbroadcasti128 m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 9 + mova xm8, [base+filter_shuf2] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + pinsrd xm0, xm6, [tlq+hq], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_XMM 6, 0, 7, 8 + movd [dstq+strideq*0], xm6 + pextrd [dstq+strideq*1], xm6, 1 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 10 + mova m8, [base+filter_shuf1] + FILTER_XMM 7, 0, 6, [base+filter_shuf2] + vpbroadcastd m0, [tlq+4] + vpbroadcastd m6, [tlq+5] + sub tlq, 4 + sub tlq, hq + vpbroadcastq m7, xm7 + vpblendd m7, m6, 0x20 +.w8_loop: + vpbroadcastd xm6, [tlq+hq] + palignr m6, m0, 12 + vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm6, xm7 + call .main + vpblendd xm6, xm7, 0x0c + pshufd xm6, xm6, q3120 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign xmm_regs_used 15 + %assign stack_size_padded 0x98 + SUB rsp, stack_size_padded +%endif + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: +%if WIN64 
+ movaps [rsp+0xa8], xmm6 + movaps [rsp+0xb8], xmm7 + movaps [rsp+0x28], xmm8 + movaps [rsp+0x38], xmm9 + movaps [rsp+0x48], xmm10 + movaps [rsp+0x58], xmm11 + movaps [rsp+0x68], xmm12 + movaps [rsp+0x78], xmm13 + movaps [rsp+0x88], xmm14 +%endif + FILTER_XMM 12, 0, 7, [base+filter_shuf2] + vpbroadcastd m0, [tlq+5] + vpblendd m0, [tlq-12], 0x14 + mova m8, [base+filter_shuf1] + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vinserti128 m14, m8, [base+filter_shuf3], 0 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+13] + vpbroadcastd m10, [tlq+12] + psrld m11, m8, 4 + vpblendd m6, m9, 0x20 ; top + sub tlq, 6 + sub tlq, hq +.w16_loop: + vpbroadcastd xm9, [tlq+hq] + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 [dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + ret 
+ALIGN function_align +.w32: + sub rsp, stack_size_padded + sub hd, 2 + lea r3, [dstq+16] + lea r5d, [hq-2] + call .w16_main + add tlq, r5 + mov dstq, r3 + lea r3, [strideq-4] + lea r4, [r3+strideq*2] + movq xm0, [tlq+21] + pinsrd xm0, [dstq-4], 2 + pinsrd xm0, [dstq+r3*1], 3 + FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 + movq xm7, [dstq+r3*2] + pinsrd xm7, [dstq+r4], 2 + palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 + vpbroadcastd m0, [tlq+28] + vpbroadcastd m9, [tlq+29] + vbroadcasti128 m8, [base+filter_shuf1+16] + vpblendd m0, m9, 0x20 + vpblendd m0, m7, 0x0f + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + add r3, 2 + lea r4, [r4+strideq*2] + movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+37] + vpbroadcastd m10, [tlq+36] + vpblendd m6, m9, 0x20 ; top +.w32_loop: + movq xm9, [dstq+r3*4] + pinsrd xm9, [dstq+r4], 2 +.w32_loop_last: + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 ; c0 d0 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 [dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub r5d, 2 + jg .w32_loop + jz .w32_loop_last + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, 
xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + RET +ALIGN function_align +.main: + FILTER_YMM 7, 0, 9, 8 + ret + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_left_avx2_table] + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd + movd xm3, r6d + movsxd r6, [t0+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov t0d, 0x8000 + shrx t0d, t0d, r6d + movd xm3, t0d + lea t0, [ipred_cfl_left_avx2_table] + movsxd r6, [t0+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 +.h4: + pmaddwd xm0, xm2 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + pcmpeqd m3, m3 + psrlw xm4, 1 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw 
xm0, xm4 + paddw xm0, xm1 + pmaddwd xm0, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 + pmulhuw xm0, xm1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+r6 ], xm5, 1 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd 
xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq+strideq*0], xm4 + vextracti128 [dstq+strideq*1], m4, 1 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq], m4 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] + add wq, t0 + movifnidn acq, acmp + jmp wq + +cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_2] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + movhps xm0, [yq+strideq*2] + movhps xm1, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + paddw xm0, xm1 + mova [acq], xm0 + paddw xm4, xm0 + 
lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm0, [yq] + mova xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_420_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_420_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m0, [yq] + vpbroadcastq m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m0, [yq] + vbroadcasti128 m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + ; fall-through 
+.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jz .w16_wpad_done + jmp iptrq +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + pmaddwd m0, m4, m2 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_4] + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm1, [yq] + movhps xm1, [yq+strideq] + movq xm0, [yq+strideq*2] + movhps xm0, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm4, xm0 + paddw xm5, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + mova xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + 
add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + movq xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_422_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_422_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m1, [yq] + vbroadcasti128 m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m1, [yq] + mova m0, [yq+strideq] + ; fall-through +.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jz .w16_wpad_done + jmp iptrq +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + 
pmaddwd m5, m5, m2 + pmaddwd m0, m4, m2 + paddd m0, m5 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + vpbroadcastd m5, [pw_1] + tzcnt r8d, wd + lea r5, [ipred_cfl_ac_444_avx2_table] + movsxd r8, [r5+r8*4+12] + add r5, r8 + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak + mov ac_bakq, acq + jmp r5 + +.w4: + lea stride3q, [strideq*3] + pxor xm2, xm2 +.w4_loop: + movd xm1, [yq] + movd xm0, [yq+strideq*2] + pinsrd xm1, [yq+strideq], 1 + pinsrd xm0, [yq+stride3q], 1 + punpcklbw xm1, xm2 + punpcklbw xm0, xm2 + psllw xm1, 3 + psllw xm0, 3 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm1, xm0 + paddw xm4, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_mul + pshufd xm0, xm0, q3232 + paddw xm1, xm0, xm0 +.w4_hpad_loop: + mova [acq], xm0 + mova [acq+16], xm0 + paddw xm4, xm1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg_mul + +.w8: + lea stride3q, [strideq*3] + pxor m2, m2 +.w8_loop: + movq xm1, [yq] + movq xm0, [yq+strideq*2] + vinserti128 m1, [yq+strideq], 1 + vinserti128 m0, [yq+stride3q], 1 + punpcklbw m1, m2 + punpcklbw m0, m2 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + paddw m4, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_mul + vpermq m0, m0, q3232 + paddw m1, m0, m0 +.w8_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m1 + add acq, 64 + sub 
hpadd, 4 + jg .w8_hpad_loop + jmp .calc_avg_mul + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+strideq] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad +.w16_wpad: + mova m3, [cfl_ac_444_w16_pad1_shuffle] +.w16_wpad_loop: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + pshufb m1, m3 + pshufb m0, m3 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w16_hpad: + paddw m1, m0, m0 + pmaddwd m1, m5 +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddd m4, m1 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg + +.w32: + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+16] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jg .w32_loop + test hpadd, hpadd + jz .calc_avg + jmp .w32_hpad_loop +.w32_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_444_avx2_table] + add wpadd, wpadd + mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w32_pad3: + vpbroadcastq m1, [yq] + pshufb m1, m3 + vpermq m0, m1, q3232 + jmp .w32_wpad_end +.w32_pad2: + pmovzxbw m1, [yq] + pshufhw m0, m1, q3333 + vpermq m0, m0, q3333 + jmp .w32_wpad_end +.w32_pad1: + pmovzxbw m1, [yq] + vpbroadcastq m0, [yq+16] + pshufb m0, m3 + ; fall-through +.w32_wpad_end: + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd 
+ jz .w32_wpad_done + jmp iptrq +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w32_hpad_loop: + mova [acq], m1 + mova [acq+32], m0 + paddd m4, m2 + add acq, 64 + dec hpadd + jg .w32_hpad_loop + jmp .calc_avg + +.calc_avg_mul: + pmaddwd m4, m5 +.calc_avg: + vextracti128 xm1, m4, 1 + tzcnt r1d, szd + paddd xm0, xm4, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m4, [palq] + lea r2, [pal_pred_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r2+wq*4] + packuswb m4, m4 + add wq, r2 + lea r2, [strideq*3] + jmp wq +.w4: + pshufb xm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + pshufb xm0, xm4, [idxq+16*0] + pshufb xm1, xm4, [idxq+16*1] + add idxq, 16*2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + add idxq, 32*2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +ALIGN function_align +.w32: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r2 ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +ALIGN 
function_align +.w64: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%endif diff --git a/third_party/dav1d/src/x86/ipred_init_tmpl.c b/third_party/dav1d/src/x86/ipred_init_tmpl.c new file mode 100644 index 0000000000..4219ab8b12 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c @@ -0,0 +1,139 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +decl_angular_ipred_fn(dav1d_ipred_dc_avx2); +decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2); +decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2); +decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2); +decl_angular_ipred_fn(dav1d_ipred_h_avx2); +decl_angular_ipred_fn(dav1d_ipred_v_avx2); +decl_angular_ipred_fn(dav1d_ipred_paeth_avx2); +decl_angular_ipred_fn(dav1d_ipred_smooth_avx2); +decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2); +decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2); +decl_angular_ipred_fn(dav1d_ipred_z1_avx2); +decl_angular_ipred_fn(dav1d_ipred_z2_avx2); +decl_angular_ipred_fn(dav1d_ipred_z3_avx2); +decl_angular_ipred_fn(dav1d_ipred_filter_avx2); + +decl_cfl_pred_fn(dav1d_ipred_cfl_avx2); +decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2); +decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2); +decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2); + +decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2); +decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2); +decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2); + +decl_pal_pred_fn(dav1d_pal_pred_avx2); + +decl_angular_ipred_fn(dav1d_ipred_dc_ssse3); +decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3); +decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3); +decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3); +decl_angular_ipred_fn(dav1d_ipred_h_ssse3); +decl_angular_ipred_fn(dav1d_ipred_v_ssse3); +decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3); 
+decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_filter_ssse3);
+
+decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
+
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
+
+decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { /* Installs x86 entry points into *c: the SSSE3 set first, then the AVX2 set on top of it, each gated by a CPU flag and by preprocessor conditions. */
+    const unsigned flags = dav1d_get_cpu_flags(); /* bitmask tested against the DAV1D_X86_CPU_FLAG_* bits below */
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; /* SSSE3 bit clear: leave every pointer in *c as the caller set it */
+
+#if BITDEPTH == 8 /* the SSSE3 assignments are compiled only for the 8-bit instantiation of this template */
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_ssse3;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_ssse3;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_ssse3;
+    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
+    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
+    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_ssse3; /* no Z1/Z2/Z3 entries are assigned in this SSSE3 block */
+
+    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_ssse3;
+    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_ssse3;
+    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_ssse3;
+    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3; /* slot = layout enum value minus one; NOTE(review): presumably because the lowest layout value has no chroma planes - confirm against the layout enum */
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
+
+    c->pal_pred = dav1d_pal_pred_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; /* AVX2 bit clear: keep whatever the SSSE3 block installed */
+
+#if BITDEPTH == 8 && ARCH_X86_64 /* unlike the SSSE3 block, the AVX2 assignments additionally require ARCH_X86_64 */
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2; /* from here on each store overwrites the SSSE3 pointer written above */
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_avx2;
+    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
+    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
+    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
+    c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2; /* Z1/Z2/Z3 are assigned only on this AVX2 path */
+    c->intra_pred[Z2_PRED]       = dav1d_ipred_z2_avx2;
+    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
+
+    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
+    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
+    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
+    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2; /* same layout-minus-one indexing as the SSSE3 block */
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2;
+
+    c->pal_pred = dav1d_pal_pred_avx2;
+#endif
+}
diff --git a/third_party/dav1d/src/x86/ipred_ssse3.asm b/third_party/dav1d/src/x86/ipred_ssse3.asm
new file mode 100644
index 0000000000..06ee256645
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_ssse3.asm
@@ -0,0 +1,3109 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2.
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +ipred_v_shuf : db 0, 1, 0, 1, 
2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 +ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +ipred_paeth_shuf : db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +filter_shuf1 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf2 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 + +pw_8 : times 8 dw 8 +pb_3 : times 16 db 3 +pb_128 : times 8 db 128 +pw_128 : times 4 dw 128 +pw_255 : times 4 dw 255 +pb_2 : times 8 db 2 +pb_4 : times 8 db 4 +pb_127_m127 : times 4 db 127, -127 +pd_32768 : times 1 dd 32768 + + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) +%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) + +JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 + +cextern filter_intra_taps + + +SECTION .text + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro IPRED_SET 3 ; width, stride, 
stride size pshuflw_imm8 + pshuflw m1, m0, %3 ; extend 8 byte for 2 pos + punpcklqdq m1, m1 + mova [dstq + %2], m1 +%if %1 > 16 + mova [dstq + 16 + %2], m1 +%endif +%if %1 > 32 + mova [dstq + 32 + %2], m1 + mova [dstq + 48 + %2], m1 +%endif +%endmacro + +%macro IPRED_H 1 ; width + sub tlq, 4 + movd m0, [tlq] ; get 4 bytes of topleft data + punpcklbw m0, m0 ; extend 2 byte +%if %1 == 4 + pshuflw m1, m0, q2233 + movd [dstq+strideq*0], m1 + psrlq m1, 32 + movd [dstq+strideq*1], m1 + pshuflw m0, m0, q0011 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+stride3q ], m0 + +%elif %1 == 8 + punpcklwd m0, m0 + punpckhdq m1, m0, m0 + punpckldq m0, m0 + movq [dstq+strideq*1], m1 + movhps [dstq+strideq*0], m1 + movq [dstq+stride3q ], m0 + movhps [dstq+strideq*2], m0 +%else + IPRED_SET %1, 0, q3333 + IPRED_SET %1, strideq, q2222 + IPRED_SET %1, strideq*2, q1111 + IPRED_SET %1, stride3q, q0000 +%endif + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +%endmacro + +INIT_XMM ssse3 +cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3 + LEA r5, ipred_h_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4 +.w8: + IPRED_H 8 +.w16: + IPRED_H 16 +.w32: + IPRED_H 32 +.w64: + IPRED_H 64 + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movu m0, [tlq+ 1] + movu m1, [tlq+17] + movu m2, [tlq+33] + movu m3, [tlq+49] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int 
dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+20] + pcmpeqd m3, m3 + psrlw m4, 1 ; dc = (width + height) >> 1; + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd m0, [tlq-4] + pmaddubsw m0, m3 + jmp wq +.w4: + movd m1, [tlq+1] + pmaddubsw m1, m3 + psubw m0, m4 + paddw m0, m1 + pmaddwd m0, m3 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 ; dc >>= ctz(width + height); + jmp .w4_end +.w4_mul: + punpckhqdq m1, m0, m0 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrlw m0, 2 + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8 + cmovz r6d, r2d + movd m5, r6d + pmulhuw m0, m5 +.w4_end: + pxor m1, m1 + pshufb m0, m1 +.s4: + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + movq m0, [tlq-8] + pmaddubsw m0, m3 + jmp wq +.w8: + movq m1, [tlq+1] + pmaddubsw m1, m3 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w8_end: + pxor m1, m1 + pshufb m0, m1 +.s8: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-16] + pmaddubsw m0, m3 + jmp wq +.w16: + movu m1, [tlq+1] + pmaddubsw m1, m3 + 
paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 ; 0x10000/3 rounded up + mov r2d, 0x3334 ; 0x10000/5 rounded up + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 ; keep the high word of dc * constant, i.e. roughly dc/3 or dc/5 +.w16_end: + pxor m1, m1 + pshufb m0, m1 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + mova m2, [tlq-16] + pmaddubsw m2, m3 + paddw m0, m2 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + movu m2, [tlq+17] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] ; NOTE(review): dead, r2d is overwritten by the mov two instructions below before it is read + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 64|16 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w32_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 +.s32: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq*2], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q], m0 + mova [dstq+stride3q+16], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-48] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-32] + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-16] + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+17] + pmaddubsw m1, m3 + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+33] + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+49] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 64 + je .w64_end + mov r6d,
0x5556 + mov r2d, 0x3334 + test hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w64_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq+32], m2 + mova [dstq+strideq+48], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_ssse3_table + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, r6d + psrld m3, m2 ; m3 = 32768 >> tzcnt(h) + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+48] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+16] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h16: + pshufd m1, m0, q3232 ; psrldq m1, m0, 8 (high qword moved down to the low qword) + paddw m0, m1 +.h8: + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 +.h4: + pmaddwd m0, m2 + pmulhrsw m0, m3 ; rounding multiply by m3/32768 + lea stride3q, [strideq*3] + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width,
const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_ssse3_table + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, wd + psrld m3, m2 + movsxd r6, [r5+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] + pmaddubsw m6, m%3, m%1 + pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b + paddw m6, m%5 + paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] + psrlw m6, 8 + psrlw m0, 8 + packuswb m6, m0 +%endmacro + +cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, 
weights +%define base r6-ipred_smooth_v_ssse3_table + LEA r6, ipred_smooth_v_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + movd m5, [tlq+hq] + pxor m2, m2 + pshufb m5, m2 + add wq, r6 + jmp wq +.w4: + movd m2, [tlq+1] + punpckldq m2, m2 + punpcklbw m2, m5 ; top, bottom + lea r3, [strideq*3] + mova m4, [base+ipred_v_shuf] + mova m5, m4 + punpckldq m4, m4 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom + paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 +.w4_loop: + movu m1, [weightsq+hq*2] + pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movd [dstq+strideq*0], m6 + pshuflw m1, m6, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m6, m6 + movd [dstq+strideq*2], m6 + psrlq m6, 32 + movd [dstq+r3 ], m6 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +ALIGN function_align +.w8: + movq m2, [tlq+1] + punpcklbw m2, m5 + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 ; m3 is output for loop +.w8_loop: + movq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movq [dstq+strideq*0], m6 + movhps [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + movu m3, [tlq+1] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 ; m4 and m5 is output for loop +.w16_loop: + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add dstq, strideq + add hq, 1 + jl .w16_loop + RET +ALIGN function_align +.w32: +%if WIN64 + movaps [rsp+24], xmm7 + %define 
xmm_regs_used 8 +%endif + mova m7, m5 +.w32_loop_init: + mov r3d, 2 +.w32_loop: + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub tlq, 32 + add hq, 1 + jl .w32_loop_init + RET +ALIGN function_align +.w64: +%if WIN64 + movaps [rsp+24], xmm7 + %define xmm_regs_used 8 +%endif + mova m7, m5 +.w64_loop_init: + mov r3d, 4 +.w64_loop: + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub tlq, 64 + add hq, 1 + jl .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_ssse3_table + LEA r6, ipred_smooth_h_ssse3_table + mov wd, wm + movd m3, [tlq+wq] + pxor m1, m1 + pshufb m3, m1 ; right + tzcnt wd, wd + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+pb_127_m127] + movddup m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + movddup m6, [base+smooth_weights+4*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + lea r3, [strideq*3] 
+.w4_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m6, [base+smooth_weights+8*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + punpckldq m7, m7 +.w8_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m6, [base+smooth_weights+16*2] + mova m7, [base+smooth_weights+16*3] + sub tlq, 1 + sub tlq, hq +.w16_loop: + pxor m1, m1 + movd m2, [tlq+hq] ; left + pshufb m2, m1 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w32_loop_init: + mov r5, 2 + lea r3, [base+smooth_weights+16*4] 
+.w32_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub hd, 1 + jg .w32_loop_init + RET +ALIGN function_align +.w64: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w64_loop_init: + mov r5, 4 + lea r3, [base+smooth_weights+16*8] +.w64_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub hd, 1 + jg .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 + pmaddubsw m6, m%3, m%1 + mova m0, m6 + pmaddubsw m6, m%4, m%2 + mova m1, m6 +%ifnum %5 + paddw m0, m%5 +%else + paddw m0, %5 +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif +%ifnum %7 +%else + mova m3, %7 +%endif + pavgw m0, m2 + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +%macro 
SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] + mova m1, [rsp+16*%1] ; top + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pmaddubsw m2, m1, m5 + mova [rsp+16*%2], m1 + paddw m1, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m1 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*%3], m2 + pmaddubsw m2, m6, m5 + mova [rsp+16*%4], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*%5], m2 + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m3, m2 + pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, %7 + paddw m2, m3, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + mova m7, [rsp+16*%9] + pshufb m1, m7 + mova [rsp+16*%8], m3 + mova m4, [rsp+16*%2] + mova m5, [rsp+16*%3] + mova m3, [rsp+16*%4] + mova m7, [rsp+16*%5] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] + mova [dstq], m0 + movddup m3, [base+pw_255] ; recovery + mova m0, [rsp+16*%10] ; recovery + mova m4, [rsp+16*%11] ; recovery + mova m5, [rsp+16*%12] ; recovery +%endmacro + +cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_ssse3_table + mov wd, wm + mov hd, hm + LEA r6, ipred_smooth_ssse3_table + movd m4, [tlq+wq] ; right + pxor m2, m2 + pshufb m4, m2 + tzcnt wd, wd + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + movddup m5, [base+pb_127_m127] + movd m0, [r5] + pshufb m0, m2 ; bottom + movddup m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] + jmp wq +.w4: + mova m7, [base+ipred_v_shuf] + movd m1, [tlq+1] ; left + pshufd m1, m1, q0000 + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m1, m0 ; top, bottom + pshufd m6, m7, q1100 + 
pshufd m7, m7, q3322 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; + punpcklqdq m1, m1 + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w4_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m7, [base+ipred_v_shuf] + movq m1, [tlq+1] ; left + punpcklqdq m1, m1 + sub tlq, 4 + sub tlq, hq + punpcklbw m1, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m2, m1, m5 + paddw m3, m1 + paddw m2, m3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w8_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + pshufd m1, m1, q1100 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw m2, m0 + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; 
weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 4 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m7, [base+ipred_v_shuf] + movu m1, [tlq+1] ; left + sub tlq, 4 + sub tlq, hq + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + pmaddubsw m2, m6, m5 + mova [rsp+16*5], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*6], m2 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + mova [rsp+16*0], m1 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*1], m2 + mova [rsp+16*3], m4 + mova [rsp+16*4], m5 +.w16_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m0, m1 + mova m3, m2 + pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, [base+smooth_weights+16*3] + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 2 + mova m7, [rsp+16*2] + pshufb m1, m7 + mova [rsp+16*7], m3 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + mova m3, [rsp+16*5] + mova m7, [rsp+16*6] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] + mova m4, [rsp+16*3] + mova m5, [rsp+16*4] + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m1, [tlq+1] ; top topleft[1 + x] + movu m2, [tlq+17] ; top + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + sub tlq, 4 + sub tlq, hq + mova m7, [base+ipred_v_shuf] + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + 
mova      [rsp+16*3], m0
+    mova      [rsp+16*4], m4
+    mova      [rsp+16*5], m5
+.w32_loop:
+    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+    add       dstq, 16
+    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+    lea       dstq, [dstq-16+strideq]
+    add       v_weightsq, 2
+    sub       hd, 1
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    movu      m1, [tlq+1] ; top topleft[1 + x]
+    movu      m2, [tlq+17] ; top
+    mova      [rsp+16*0], m1
+    mova      [rsp+16*1], m2
+    movu      m1, [tlq+33] ; top
+    movu      m2, [tlq+49] ; top
+    mova      [rsp+16*11], m1
+    mova      [rsp+16*12], m2
+    sub       tlq, 4
+    sub       tlq, hq
+    mova      m7, [base+ipred_v_shuf]
+    pshufd    m7, m7, q0000
+    mova      [rsp+16*2], m7
+    mova      [rsp+16*3], m0
+    mova      [rsp+16*4], m4
+    mova      [rsp+16*5], m5
+.w64_loop:
+    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+    add       dstq, 16
+    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+    add       dstq, 16
+    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+    add       dstq, 16
+    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+    lea       dstq, [dstq-48+strideq]
+    add       v_weightsq, 2
+    sub       hd, 1
+    jg .w64_loop
+    RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+;                         const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
+cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h ; palette prediction: every idx byte is replaced by its palette entry via pshufb. NOTE(review): the prototype above says 'int' but no return value is produced here - presumably void; confirm against the C declaration
+    mova      m4, [palq] ; 16 bytes of palette = eight 16-bit entries
+    LEA       r2, pal_pred_ssse3_table ; palq (r2) is dead after the load above; r2 becomes the jump-table base
+    tzcnt     wd, wm ; ctz(w) selects the per-width entry point
+    movifnidn hd, hm
+    movsxd    wq, [r2+wq*4] ; table entries are 32-bit offsets relative to the table
+    packuswb  m4, m4 ; narrow the entries to bytes; both qwords now hold the same 8-byte LUT
+    add       wq, r2
+    lea       r2, [strideq*3] ; r2 = 3*stride, used by the 4-row stores
+    jmp       wq
+.w4: ; 16 indices per iteration = 4 rows of 4 pixels
+    pshufb    m0, m4, [idxq]
+    add       idxq, 16
+    movd      [dstq          ], m0
+    pshuflw   m1, m0, q1032 ; bring dword 1 down to dword 0
+    movd      [dstq+strideq  ], m1
+    punpckhqdq m0, m0
+    movd      [dstq+strideq*2], m0
+    psrlq     m0, 32
+    movd      [dstq+r2       ], m0
+    lea       dstq, [dstq+strideq*4]
+    sub       hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+.w8: ; 32 indices per iteration = 4 rows of 8 pixels
+    pshufb    m0, m4, [idxq]
+    pshufb    m1, m4, [idxq+16]
+    add       idxq, 32
+    movq      [dstq          ], m0
+    movhps    [dstq+strideq  ], m0
+    movq      [dstq+strideq*2], m1
+    movhps    [dstq+r2       ], m1
+    lea       dstq, [dstq+strideq*4]
+    sub       hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16: ; 64 indices per iteration = 4 rows of 16 pixels
+    pshufb    m0, m4, [idxq]
+    pshufb    m1, m4, [idxq+16]
+    pshufb    m2, m4, [idxq+32]
+    pshufb    m3, m4, [idxq+48]
+    add       idxq, 64
+    mova      [dstq          ], m0
+    mova      [dstq+strideq  ], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+r2       ], m3
+    lea       dstq, [dstq+strideq*4]
+    sub       hd, 4
+    jg .w16
+    RET
+ALIGN function_align
+.w32: ; 64 indices per iteration = 2 rows of 32 pixels
+    pshufb    m0, m4, [idxq]
+    pshufb    m1, m4, [idxq+16]
+    pshufb    m2, m4, [idxq+32]
+    pshufb    m3, m4, [idxq+48]
+    add       idxq, 64
+    mova      [dstq           ], m0
+    mova      [dstq+16        ], m1
+    mova      [dstq+strideq   ], m2
+    mova      [dstq+strideq+16], m3
+    lea       dstq, [dstq+strideq*2]
+    sub       hd, 2
+    jg .w32
+    RET
+ALIGN function_align
+.w64: ; 64 indices per iteration = 1 row of 64 pixels
+    pshufb    m0, m4, [idxq]
+    pshufb    m1, m4, [idxq+16]
+    pshufb    m2, m4, [idxq+32]
+    pshufb    m3, m4, [idxq+48]
+    add       idxq, 64
+    mova      [dstq   ], m0
+    mova      [dstq+16], m1
+    mova      [dstq+32], m2
+    mova      [dstq+48], m3
+    add       dstq, strideq
+    sub       hd, 1
+    jg .w64
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                           const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out (expects m0 = dc, m1 = alpha, m2 = |alpha| << 9; clobbers m3)
+    psignw    m3, m%1, m1 ; m3 carries the sign of ac * alpha (zero if either is zero)
+    pabsw     m%1, m%1 ; |ac|
+    pmulhrsw  m%1, m2 ; (|ac| * |alpha| + 32) >> 6
+    psignw    m%1, m3 ; restore the sign of the product
+    paddw     m%1, m0 ; + dc
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ; CfL prediction: dc = rounded mean of the top and left edge pixels, then dst = saturate_u8(dc + scaled ac) via IPRED_CFL
+    movifnidn wd, wm
+    movifnidn hd, hm
+    tzcnt     r6d, hd ; ctz(h): index of the height entry point
+    lea       t0d, [wq+hq]
+    movd      m4, t0d ; m4 = w + h
+    tzcnt     t0d, t0d
+    movd      m5, t0d ; m5 = ctz(w + h), the shift for the power-of-two part of the divide
+    LEA       t0, ipred_cfl_ssse3_table
+    tzcnt     wd, wd
+    movsxd    r6, [t0+r6*4]
+    movsxd    wq, [t0+wq*4+16] ; width entries are read 4 dwords past the height entries (table is defined outside this chunk)
+    pcmpeqd   m3, m3 ; all ones: acts as a -1 multiplier for pmaddubsw/pmaddwd
+    psrlw     m4, 1 ; (w + h) >> 1, the rounding bias
+    add       r6, t0
+    add       wq, t0
+    movifnidn acq, acmp
+    jmp       r6 ; reduce the left edge first, then continue at wq for the top edge
+.h4:
+    movd      m0, [tlq-4]
+    pmaddubsw m0, m3 ; negated sums of adjacent left pixels
+    jmp       wq
+.w4:
+    movd      m1, [tlq+1]
+    pmaddubsw m1, m3 ; negated sums of adjacent top pixels
+    psubw     m0, m4 ; fold in the rounding bias (still negated)
+    paddw     m0, m1
+    pmaddwd   m0, m3 ; multiply by -1 and add word pairs: positive sum + bias
+    cmp       hd, 4
+    jg .w4_mul
+    psrlw     m0, 3 ; dc >>= ctz(width + height);
+    jmp .w4_end
+.w4_mul: ; w + h is 4*3 or 4*5 here: shift by 2, then multiply by a 16.16 reciprocal
+    punpckhqdq m1, m0, m0
+    paddw     m0, m1
+    pshuflw   m1, m0, q1032 ; psrlq m1, m0, 32
+    paddw     m0, m1
+    psrlw     m0, 2
+    mov       r6d, 0x5556 ; ~ 2^16 / 3
+    mov       r2d, 0x3334 ; ~ 2^16 / 5 (tl, which lives in r2, is no longer needed)
+    test      hd, 8
+    cmovz     r6d, r2d ; bit 3 of h clear -> use the 1/5 factor
+    movd      m5, r6d
+    pmulhuw   m0, m5 ; keep the high 16 bits of the product
+.w4_end:
+    pshuflw   m0, m0, q0000
+    punpcklqdq m0, m0 ; broadcast dc to all 8 words
+.s4: ; m0 = dc in every word; apply the ac term, 4 rows of 4 pixels per iteration
+    movd      m1, alpham
+    pshuflw   m1, m1, q0000
+    punpcklqdq m1, m1 ; broadcast alpha
+    lea       r6, [strideq*3]
+    pabsw     m2, m1
+    psllw     m2, 9 ; |alpha| << 9, the pmulhrsw multiplier used by IPRED_CFL
+.s4_loop:
+    mova      m4, [acq]
+    mova      m5, [acq+16]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5 ; saturate to unsigned bytes
+    movd      [dstq+strideq*0], m4
+    pshuflw   m4, m4, q1032
+    movd      [dstq+strideq*1], m4
+    punpckhqdq m4, m4
+    movd      [dstq+strideq*2], m4
+    psrlq     m4, 32
+    movd      [dstq+r6       ], m4
+    lea       dstq, [dstq+strideq*4]
+    add       acq, 32
+    sub       hd, 4
+    jg .s4_loop
+    RET
+ALIGN function_align
+.h8:
+    movq      m0, [tlq-8]
+    pmaddubsw m0, m3
+    jmp       wq
+.w8:
+    movq      m1, [tlq+1]
+    pmaddubsw m1, m3
+    psubw     m4, m0 ; bias + left sums (m0 holds negated sums)
+    punpckhqdq m0, m0
+    psubw     m0, m4 ; fold the high half in; total is negated again
+    paddw     m0, m1
+    pshuflw   m1, m0, q1032 ; psrlq m1, m0, 32
+    paddw     m0, m1
+    pmaddwd   m0, m3 ; multiply by -1 and add word pairs: positive sum + bias
+    psrlw     m0, m5
+    cmp       hd, 8
+    je .w8_end ; w == h: w + h is a power of two, the shift alone is exact
+    mov       r6d, 0x5556 ; ~ 2^16 / 3
+    mov       r2d, 0x3334 ; ~ 2^16 / 5
+    cmp       hd, 32
+    cmovz     r6d, r2d ; h == 32 -> use the 1/5 factor
+    movd      m1, r6d
+    pmulhuw   m0, m1
+.w8_end:
+    pshuflw   m0, m0, q0000
+    punpcklqdq m0, m0
+.s8: ; as .s4, but 4 rows of 8 pixels per iteration
+    movd      m1, alpham
+    pshuflw   m1, m1, q0000
+    punpcklqdq m1, m1
+    lea       r6, [strideq*3]
+    pabsw     m2, m1
+    psllw     m2, 9
+.s8_loop:
+    mova      m4, [acq]
+    mova      m5, [acq+16]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    movq      [dstq          ], m4
+    movhps    [dstq+strideq  ], m4
+    mova      m4, [acq+32]
+    mova      m5, [acq+48]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    movq      [dstq+strideq*2], m4
+    movhps    [dstq+r6       ], m4
+    lea       dstq, [dstq+strideq*4]
+    add       acq, 64
+    sub       hd, 4
+    jg .s8_loop
+    RET
+ALIGN function_align
+.h16:
+    mova      m0, [tlq-16]
+    pmaddubsw m0, m3
+    jmp       wq
+.w16:
+    movu      m1, [tlq+1]
+    pmaddubsw m1, m3
+    paddw     m0, m1
+    psubw     m4, m0
+    punpckhqdq m0, m0
+    psubw     m0, m4
+    pshuflw   m1, m0, q1032 ; psrlq m1, m0, 32
+    paddw     m0, m1
+    pmaddwd   m0, m3
+    psrlw     m0, m5
+    cmp       hd, 16
+    je .w16_end
+    mov       r6d, 0x5556 ; ~ 2^16 / 3
+    mov       r2d, 0x3334 ; ~ 2^16 / 5
+    test      hd, 8|32
+    cmovz     r6d, r2d ; h is neither 8 nor 32 -> use the 1/5 factor
+    movd      m1, r6d
+    pmulhuw   m0, m1
+.w16_end:
+    pshuflw   m0, m0, q0000
+    punpcklqdq m0, m0
+.s16: ; as .s4, but 2 rows of 16 pixels per iteration
+    movd      m1, alpham
+    pshuflw   m1, m1, q0000
+    punpcklqdq m1, m1
+    pabsw     m2, m1
+    psllw     m2, 9
+.s16_loop:
+    mova      m4, [acq]
+    mova      m5, [acq+16]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    mova      [dstq], m4
+    mova      m4, [acq+32]
+    mova      m5, [acq+48]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    mova      [dstq+strideq], m4
+    lea       dstq, [dstq+strideq*2]
+    add       acq, 64
+    sub       hd, 2
+    jg .s16_loop
+    RET
+ALIGN function_align
+.h32:
+    mova      m0, [tlq-32]
+    pmaddubsw m0, m3
+    mova      m2, [tlq-16]
+    pmaddubsw m2, m3
+    paddw     m0, m2
+    jmp       wq
+.w32:
+    movu      m1, [tlq+1]
+    pmaddubsw m1, m3
+    movu      m2, [tlq+17]
+    pmaddubsw m2, m3
+    paddw     m1, m2
+    paddw     m0, m1
+    psubw     m4, m0
+    punpckhqdq m0, m0
+    psubw     m0, m4
+    pshuflw   m1, m0, q1032 ; psrlq m1, m0, 32
+    paddw     m0, m1
+    pmaddwd   m0, m3
+    psrlw     m0, m5
+    cmp       hd, 32
+    je .w32_end
+    lea       r2d, [hq*2] ; NOTE(review): r2d is overwritten two instructions below without being read - this lea looks dead
+    mov       r6d, 0x5556 ; ~ 2^16 / 3
+    mov       r2d, 0x3334 ; ~ 2^16 / 5
+    test      hd, 64|16
+    cmovz     r6d, r2d ; h is neither 16 nor 64 -> use the 1/5 factor
+    movd      m1, r6d
+    pmulhuw   m0, m1
+.w32_end:
+    pshuflw   m0, m0, q0000
+    punpcklqdq m0, m0
+.s32: ; as .s4, but 1 row of 32 pixels per iteration
+    movd      m1, alpham
+    pshuflw   m1, m1, q0000
+    punpcklqdq m1, m1
+    pabsw     m2, m1
+    psllw     m2, 9
+.s32_loop:
+    mova      m4, [acq]
+    mova      m5, [acq+16]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    mova      [dstq], m4
+    mova      m4, [acq+32]
+    mova      m5, [acq+48]
+    IPRED_CFL 4
+    IPRED_CFL 5
+    packuswb  m4, m5
+    mova      [dstq+16], m4
+    add       dstq, strideq
+    add       acq, 64
+    dec       hd
+    jg .s32_loop
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ; CfL with dc = rounded mean of the h pixels below tl (left edge); finishes by jumping through ipred_cfl_splat_ssse3_table with dc broadcast in m0
+    mov       hd, hm ; zero upper half
+    tzcnt     r6d, hd ; ctz(h): rounding shift and index of the .hN entry
+    sub       tlq, hq ; tl - h: lowest address of the left edge
+    tzcnt     wd, wm ; ctz(w): index into the splat table
+    movu      m0, [tlq]
+    mov       t0d, 0x8000
+    movd      m3, t0d
+    movd      m2, r6d
+    psrld     m3, m2 ; 0x8000 >> ctz(h): pmulhrsw by this is a rounded shift right by ctz(h)
+    LEA       t0, ipred_cfl_left_ssse3_table
+    movsxd    r6, [t0+r6*4]
+    pcmpeqd   m2, m2 ; all ones: acts as a -1 multiplier for pmaddubsw/pmaddwd
+    pmaddubsw m0, m2 ; negated sums of adjacent pixels
+    add       r6, t0
+    add       t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table ; rebase t0 onto the splat table
+    movsxd    wq, [t0+wq*4]
+    add       wq, t0
+    movifnidn acq, acmp
+    jmp       r6
+.h32: ; the .hN labels fall through, halving the number of partial sums each step
+    movu      m1, [tlq+16] ; unaligned when jumping here from ipred_cfl_top (tlq = tl + 1)
+    pmaddubsw m1, m2
+    paddw     m0, m1
+.h16:
+    pshufd    m1, m0, q3232 ; high qword -> low qword (psrldq m1, m0, 8)
+    paddw     m0, m1
+.h8:
+    pshuflw   m1, m0, q1032 ; psrlq m1, m0, 32
+    paddw     m0, m1
+.h4:
+    pmaddwd   m0, m2 ; multiply by -1 and add the last word pair: positive total
+    pmulhrsw  m0, m3 ; rounded divide by the edge length
+    pshuflw   m0, m0, q0000
+    punpcklqdq m0, m0 ; broadcast dc to all 8 words
+    jmp       wq ; continue at the per-width splat entry
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                               const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ; CfL with dc = rounded mean of the w pixels after tl (top edge); reuses the reduction reached through ipred_cfl_left_ssse3_table, indexed by ctz(w)
+    LEA       t0, ipred_cfl_left_ssse3_table
+    tzcnt     wd, wm ; ctz(w): rounding shift and index into both tables
+    inc       tlq ; the top edge starts at tl + 1
+    movu      m0, [tlq]
+    movifnidn hd, hm
+    mov       r6d, 0x8000
+    movd      m3, r6d
+    movd      m2, wd
+    psrld     m3, m2 ; 0x8000 >> ctz(w): pmulhrsw by this is a rounded shift right by ctz(w)
+    movsxd    r6, [t0+wq*4] ; entry in the left table selected by width, not height
+    pcmpeqd   m2, m2 ; all ones: acts as a -1 multiplier for pmaddubsw/pmaddwd
+    pmaddubsw m0, m2 ; negated sums of adjacent pixels
+    add       r6, t0
+    add       t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table ; rebase t0 onto the splat table
+    movsxd    wq, [t0+wq*4]
+    add       wq, t0
+    movifnidn acq, acmp
+    jmp       r6
+
+;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + tzcnt wd, wm + movifnidn hd, hm + LEA r6, ipred_cfl_splat_ssse3_table + movsxd wq, [r6+wq*4] + movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] + add wq, r6 + movifnidn acq, acmp + jmp wq + +%macro RELOAD_ACQ_32 1 + mov acq, ac_bakq ; restore acq +%endmacro + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +DECLARE_REG_TMP 7 + movddup m2, [pb_2] +%else +cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +DECLARE_REG_TMP 4 +%define ac_bakq acmp + mov t0d, 0x02020202 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m5, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m0, [yq] + movq m1, [yq+strideq] + movhps m0, [yq+strideq*2] + movhps m1, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4_8 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4_8 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw 
m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m0, [yq+strideq*2] + mova m1, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_4_8 + jmp .w8_hpad +.w8_wpad: ; wpadd=1 + movddup m0, [yq] + movddup m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 16 + sub hd, 1 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_4_8 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 1 + jg .w8_hpad + jmp .calc_avg_4_8 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m6, [yq+16] + mova m1, [yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m0, [yq] + movddup m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + mova m6, m0 + punpckhqdq m6, m0, m0 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + pshufhw m6, m0, q3333 + punpckhqdq m6, m6 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + movddup m6, [yq+16] + movddup m1, 
[yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + pshufhw m6, m6, q3333 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg16 +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m6 + paddw m4, m6 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + jmp .calc_avg16 + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4_8: + psrlw m2, 9 + pmaddwd m4, m2 + jmp .calc_avg +.calc_avg16: + psrld m0, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m0 +.calc_avg: + movd szd, m5 + psrad m5, 1 + tzcnt r1d, szd + paddd m4, m5 + movd m1, r1d + pshufd m0, m4, q2301 + paddd m0, m4 + pshufd m4, m0, q1032 + paddd m0, m4 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq +.sub_loop: + mova m1, [acq] + psubw m1, m0 ; ac[x] -= sum; + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m1, [yq] + movhps m1, [yq+strideq] + movq m0, [yq+strideq*2] + movhps m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + 
test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + mova m1, [yq+strideq*2] + mova m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+strideq] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + mova m1, [yq+strideq] + mova m0, [yq+strideq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movddup m1, [yq+strideq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea 
yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m5, m0 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movddup m0, [yq+strideq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg_8_16 + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + pmaddwd m0, m4, m2 + jmp .calc_avg +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 +.calc_avg: + paddd m5, m0 + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg 
.sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h +%define ac_bakq [rsp+16*4] + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + movifnidn hpadd, hpadm + movd m0, hpadd + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movd hpadd, m0 + mov ac_bakq, acq + shl hpadd, 2 + sub hd, hpadd + pxor m5, m5 + pxor m4, m4 + cmp wd, 16 + jg .w32 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movd m1, [yq] + movd m3, [yq+strideq] + punpckldq m1, m3 + punpcklbw m1, m1 + movd m0, [yq+strideq*2] + movd m3, [yq+stride3q] + punpckldq m0, m3 + punpcklbw m0, m0 + pmaddubsw m1, m2 + pmaddubsw m0, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m5, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movq m0, [yq+strideq] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + movq m1, [yq+strideq*2] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movq m0, [yq+stride3q] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + 
pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movd m0, [yq+strideq] + punpcklbw m0, m0 + punpcklqdq m0, m0 + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m5, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movd m1, [yq+strideq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movq m1, [yq+strideq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done 
+.w16_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 + paddd m5, m0 + jmp .calc_avg + +.w32: + pxor m0, m0 + mova [rsp ], m0 + mova [rsp+16], m0 + mova [rsp+32], m0 + mova [rsp+48], m0 + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_loop + test hpadd, hpadd + jz .calc_avg_32 + jmp .w32_hpad_loop +.w32_wpad: + cmp wpadd, 2 + jl .w32_pad1 + je .w32_pad2 + cmp wpadd, 4 + jl .w32_pad3 + je .w32_pad4 + cmp wpadd, 6 + jl .w32_pad5 + je .w32_pad6 +.w32_pad7: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw 
m5, m1, [rsp] + mova [rsp ], m5 + mova m0, m1 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad7 + jmp .w32_wpad_done +.w32_pad6: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + pshufhw m0, m1, q3333 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad6 + jmp .w32_wpad_done +.w32_pad5: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + mova m5, [rsp] + paddw m5, m1 + mova [rsp ], m5 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad5 + jmp .w32_wpad_done +.w32_pad4: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + pshufhw m3, m3, q3333 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad4 + jmp .w32_wpad_done +.w32_pad3: + mova m0, [yq] + mova m1, m0 + 
punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + movd m3, [yq+16] + punpcklbw m3, m3 + punpcklqdq m3, m3 + pshufhw m3, m3, q3333 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad3 + jmp .w32_wpad_done +.w32_pad2: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, [yq+16] + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + pshufhw m4, m3, q3333 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad2 + jmp .w32_wpad_done +.w32_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + punpcklqdq m4, m4 + pshufhw m4, m4, q3333 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad1 +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg_32 +.w32_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m1, [rsp] + mova [rsp ], m5 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova [acq+32], m3 + mova [acq+48], m4 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + paddw m5, m4, [rsp+48] + mova 
[rsp+48], m5 + add acq, 64 + sub hpadd, 1 + jg .w32_hpad_loop + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif + +.calc_avg_32: + mova m5, [rsp] + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, [rsp+16] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + paddd m5, m0 + mova m0, [rsp+32] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + mova m1, [rsp+48] + mova m3, m1 + psrld m1, 16 + pslld m3, 16 + psrld m3, 16 + paddd m1, m3 + paddd m1, m0 + paddd m5, m1 +.calc_avg: + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +; %1 simd register that hold the mask and will hold the result +; %2 simd register that holds the "true" values +; %3 location of the "false" values (simd register/memory) +%macro BLEND 3 ; mask, true, false + pand %2, %1 + pandn %1, %3 + por %1, %2 +%endmacro + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 + pxor m0, m%1, m3 + pand m0, m4 + psubusb m2, m5, m1 + psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff +%ifnum %2 + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff +%else + mova m0, %2 + pminub m2, m0 + pcmpeqb m0, m2 +%endif + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff + mova m2, m3 + BLEND m0, m2, m%1 + BLEND m1, m0, m5 +%endmacro + +cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h +%define base r5-ipred_paeth_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + pxor m0, m0 + movd m5, [tlq] + 
pshufb m5, m0 + LEA r5, ipred_paeth_ssse3_table + movsxd wq, [r5+wq*4] + movddup m4, [base+ipred_paeth_shuf] + add wq, r5 + jmp wq +.w4: + movd m6, [tlq+1] ; top + pshufd m6, m6, q0000 + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 4 + movd m3, [tlq] + mova m1, [base+ipred_h_shuf] + pshufb m3, m1 ; left + PAETH 6, 7 + movd [dstq ], m1 + pshuflw m0, m1, q1032 + movd [dstq+strideq ], m0 + punpckhqdq m1, m1 + movd [dstq+strideq*2], m1 + psrlq m1, 32 + movd [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + movddup m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + sub tlq, 2 + movd m3, [tlq] + pshufb m3, [base+ipred_paeth_shuf] + PAETH 6, 7 + movq [dstq ], m1 + movhps [dstq+strideq], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + sub tlq, 1 + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + PAETH 6, 7 + mova [dstq], m1 + add dstq, strideq + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 +.w32_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, 7 + mova [dstq+16], m1 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 + mova [rsp+48], m7 + movu m6, [tlq+33] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+64], m6 + mova [rsp+80], m7 + movu 
m6, [tlq+49] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+96], m6 +.w64_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, [rsp+48] + mova [dstq+16], m1 + mova m6, [rsp+64] + PAETH 6, [rsp+80] + mova [dstq+32], m1 + mova m6, [rsp+96] + PAETH 6, 7 + mova [dstq+48], m1 + add dstq, strideq + dec hd + jg .w64_loop + RET + + +%macro FILTER 4 ;dst, src, tmp, shuf +%ifnum %4 + pshufb m%2, m%4 +%else + pshufb m%2, %4 +%endif + pshufd m%1, m%2, q0000 ;p0 p1 + pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 ;p2 p3 + pmaddubsw m%3, m3 + paddw m%1, [base+pw_8] + paddw m%1, m%3 + pshufd m%3, m%2, q2222 ;p4 p5 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 ;p6 __ + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + packuswb m%1, m%1 +%endmacro + +cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter +%define base r6-$$ + LEA r6, $$ + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + lea filterq, [base+filter_intra_taps+filterq] + movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 + movsxd wq, [base+ipred_filter_ssse3_table+wq*4] + mova m2, [filterq+16*0] + mova m3, [filterq+16*1] + mova m4, [filterq+16*2] + mova m5, [filterq+16*3] + lea wq, [base+ipred_filter_ssse3_table+wq] + mov hd, hm + jmp wq +.w4: + mova m1, [base+filter_shuf1] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + movd m0, [tlq+hq] + punpckldq m0, m6 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER 6, 0, 7, 1 + movd [dstq+strideq*0], m6 + pshuflw m6, m6, q1032 + movd [dstq+strideq*1], m6 + sub hd, 2 + jg .w4_loop + RET + +ALIGN function_align +.w8: + movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 + sub tlq, 5 + sub tlq, hq + +.w8_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER 0, 6, 1, [base+filter_shuf2] + + punpckldq m6, m7, m0 + movq [dstq+strideq*0], 
m6 + punpckhqdq m6, m6 + movq [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET + +ALIGN function_align +.w16: + movu m6, [tlq+1] ;top row + sub tlq, 5 + sub tlq, hq + +.w16_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET + +ALIGN function_align +.w32: + movu m6, [tlq+1] ;top row + lea filterq, [tlq+17] + sub tlq, 5 + sub tlq, hq + +.w32_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movu m1, [filterq] + punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ + punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + mova m6, m1 + + FILTER 7, 0, 6, [base+filter_shuf2] + punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ 
_ 5 _ _ _ 6 + movd [dstq+16+strideq*0], m7 + psrlq m7, 32 + palignr m7, m1, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+20+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+24+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+28+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+16+strideq*1], m6 + + mova m6, [dstq+strideq*1] + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + lea filterq, [dstq+16+strideq*1] + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET diff --git a/third_party/dav1d/src/x86/itx.asm b/third_party/dav1d/src/x86/itx.asm new file mode 100644 index 0000000000..5b373c5f7e --- /dev/null +++ b/third_party/dav1d/src/x86/itx.asm @@ -0,0 +1,5563 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) these constants matters! + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 ; emits pw_a_b: (a, b) and pw_mb_a: (-b, a) +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%endmacro + +; ADST-only +pw_3803_1321: dw 3803, 1321 +pw_m1321_2482: dw -1321, 2482 +pw_2482_3344: dw 2482, 3344 +pw_m3344_3344: dw -3344, 3344 +pw_m3803_3344: dw -3803, 3344 +pw_m3803_m6688: dw -3803, -6688 +pw_2896_m2896: dw 2896, -2896 + +pw_5: times 2 dw 5 +pw_2048: times 2 dw 2048 +pw_4096: times 2 dw 4096 +pw_8192: times 2 dw 8192 +pw_16384: times 2 dw 16384 +pw_1697x16: times 2 dw 1697*16 +pw_1697x8: times 2 dw 1697*8 +pw_2896x8: times 2 dw 2896*8 + +pd_2048: dd 2048 + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4052, 601 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +pw_m799_m4017: dw -799, -4017 +pw_m1567_m3784: dw -1567, -3784 +pw_m3406_m2276: dw -3406, -2276 +pw_m401_m4076: dw -401, -4076 +pw_m3166_m2598: dw -3166, -2598 +pw_m1931_m3612: dw -1931, -3612 +pw_m3920_m1189: dw -3920, -1189 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* ; emits every argument scaled by 8 as a duplicated word pair +%rep %0 + dw %1*8,
%1*8 + %rotate 1 +%endrep +%endmacro + +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 + COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 + COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 + COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +%define o_idct64_offset idct64_mul - (o_base) - 8 + +SECTION .text + +; Code size reduction trickery: Instead of using rip-relative loads with +; mandatory 4-byte offsets everywhere, we can set up a base pointer with a +; single rip-relative lea and then address things relative from that with +; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
+%define o_base deint_shuf + 128 +%define o(x) (rax - (o_base) + (x)) + +%macro REPX 2-* ; first arg: instruction template written in terms of x; remaining args: operands substituted for x + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags +%if %7 & 4 + pmaddwd m%2, m%5, m%1 + pmaddwd m%1, m%6 +%else +%if %7 & 1 + vpbroadcastd m%2, [o(pw_%5_%6)] + vpbroadcastd m%3, [o(pw_m%6_%5)] +%else + vpbroadcastd m%2, [o(pw_m%6_%5)] + vpbroadcastd m%3, [o(pw_%5_%6)] +%endif + pmaddwd m%2, m%1 + pmaddwd m%1, m%3 +%endif + paddd m%2, m%4 + paddd m%1, m%4 +%if %7 & 2 + pslld m%2, 4 + psrld m%1, 12 + pblendw m%1, m%2, 0xaa +%else + psrad m%2, 12 + psrad m%1, 12 + packssdw m%1, m%2 +%endif +%endmacro + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags +%if %10 & 1 + vpbroadcastd m%3, [o(pw_%8_%9)] + vpbroadcastd m%4, [o(pw_m%9_%8)] + vpbroadcastd xm%2, [o(pw_%6_%7)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_m%7_%6)] +%else + vpbroadcastd m%3, [o(pw_m%9_%8)] + vpbroadcastd m%4, [o(pw_%8_%9)] + vpbroadcastd xm%2, [o(pw_m%7_%6)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_%6_%7)] +%endif + vpblendd m%3, m%4, 0xf0 + ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpckhwd m%3, m%2, m%1 + punpcklwd m%2, m%1 +%if %7 < 32 + pmaddwd m%1, m%7, m%2 + pmaddwd m%4, m%7, m%3 +%else + vpbroadcastd m%1, [o(pw_m%7_%6)] + pmaddwd m%4, m%3, m%1 + pmaddwd m%1, m%2 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 +%if %7 < 32 + pmaddwd m%3, m%6 + pmaddwd m%2, m%6 +%else + vpbroadcastd m%4, [o(pw_%6_%7)] + pmaddwd m%3, m%4 + pmaddwd m%2, m%4 +%endif + paddd m%3, m%5 + paddd
m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %0 == 8 ; optional dst2 given: pack the second result into it instead of into src2 + packssdw m%8, m%2, m%3 +%else + packssdw m%2, m%3 +%endif +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 + psubsw m%3, m%1, m%2 + paddsw m%2, m%1 + paddsw m%1, m%4, m%5 + psubsw m%4, m%5 +%endmacro + +%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 + paddsw m%9, m%2, m%6 ; t4 + psubsw m%2, m%6 ; t5a + paddsw m%10, m%8, m%4 ; t7 + psubsw m%8, m%4 ; t6a + ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 + ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 + psubsw m%6, m%1, m%3 ; dct4 out2 + paddsw m%3, m%1 ; dct4 out1 + paddsw m%1, m%5, m%7 ; dct4 out0 + psubsw m%5, m%7 ; dct4 out3 + psubsw m%7, m%3, m%2 ; out6 + paddsw m%2, m%3 ; out1 + paddsw m%3, m%6, m%8 ; out2 + psubsw m%6, m%8 ; out5 + psubsw m%8, m%1, m%10 ; out7 + paddsw m%1, m%10 ; out0 + paddsw m%4, m%5, m%9 ; out3 + psubsw m%5, m%9 ; out4 +%endmacro + +; in1 = %1, in3 = %2, in5 = %3, in7 = %4 +; in9 = %5, in11 = %6, in13 = %7, in15 = %8 +%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a + psubsw m%9, m%2, m%6 ; t13 + paddsw m%6, m%2 ; t12 + psubsw m%2, m%8, m%4 ; t14 + paddsw m%8, m%4 ; t15 + psubsw m%4, m%7, m%3 ; t10 + paddsw m%3, m%7 ; t11 + psubsw m%7, m%1, m%5 ; t9 + paddsw m%1, m%5 ; t8 + ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a + ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a + psubsw m%5, m%1, m%3 ; t11a + paddsw m%1, m%3 ; t8a + psubsw m%3, m%7,
m%4 ; t13 + paddsw m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ ; expand the argument with xmm register mappings, then switch back to ymm + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal) + lea rax, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 ; in/out: m0-m1, clobbers m2-m4 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 ; in/out: m0-m1, clobbers m2-m5 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(pw_m3344_3344)] + vpbroadcastd m0, [o(pw_3803_1321)] + vpbroadcastd m4, [o(pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(pw_2482_3344)] + vpbroadcastd m5, [o(pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(pd_2048)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 12}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2
+ jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: ; the 4x4 dct_dct fast path jumps here directly + ITX4_END 0, 1, 2, 3 +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 + pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5,
[r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 ; in/out: m0-m3, clobbers m4-m6 + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + vpbroadcastd m1, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a +
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(pw_m2896_2896)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(pw_2896_2896)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 12 + psrad m1, 12 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else ; pass != 1 (invoked with 2 by .main_pass2) + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(pw_2896x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: ; the 4x8 dct_dct fast path jumps here directly + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +.main_pass2: + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2,
[o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(pw_2896x8)] + vpbroadcastd m4, [o(pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_4x8_internal).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + movd xm3, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + vpbroadcastd m10, [o(pd_2048)] +.main2: ; entering here skips the m10 = pd_2048 load above + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a +
ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 ; ymm: replicate the 16-byte shuffle mask into both lanes + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(pw_2896_2896)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct,
identity + +cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct_16x4_internal).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + psubw m6, m7, m5 ; m6 = -m5 (m7 was zeroed in .main2) +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova
[cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in12 in15 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +.main2: + vpbroadcastd m8, [o(pd_2048)] + pxor m7, m7 ; m7 = 0, used to negate coefficient vectors via psubw + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(pw_m3784_1567)] + vpbroadcastd m5, [o(pw_1567_3784)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; pw_3784_m1567 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a
t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(pw_m2896_2896)] + vpbroadcastd m6, [o(pw_2896_2896)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 12}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal).main + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal).main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 ; m5 = -m6 (m7 is zero after .main) + jmp m(iadst_4x16_internal).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] +
vpbroadcastd m8, [o(pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. + pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 ; numeric arg = register index, otherwise it is used verbatim as the operand + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + pmulhrsw xm0, xm1 + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_8x4_internal).end3 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity +
+; idct_8x4_internal
+; Pass 1 multiplies the four 16-byte coefficient rows at cq by the broadcast
+; pw_2896x8 constant, runs the shared idct_4x8_internal.main kernel, merges
+; the four xmm results into m0/m1 and tail-jumps through tx2q.
+; .pass2 applies IDCT4_1D_PACKED, reorders the qwords and continues in the
+; shared rounding and store epilogue at iadst_8x4_internal.end2.
+; NOTE(review): the cglobal operands look like the usual x86inc form
+; (name, args, GPRs, vector registers, arg names) - confirm against x86inc.asm.
+cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd xm3, [o(pw_2896x8)]
+    pmulhrsw xm0, xm3, [cq+16*0]
+    pmulhrsw xm1, xm3, [cq+16*1]
+    pmulhrsw xm2, xm3, [cq+16*2]
+    pmulhrsw xm3, [cq+16*3]
+    call m(idct_4x8_internal).main
+    vbroadcasti128 m4, [o(deint_shuf)]
+    vinserti128 m3, m1, xm3, 1
+    vinserti128 m1, m0, xm2, 1
+    shufps m0, m1, m3, q0220
+    shufps m1, m3, q1331
+    pshufb m0, m4
+    pshufb m1, m4
+    jmp tx2q
+.pass2:
+    IDCT4_1D_PACKED
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+; iadst_8x4_internal
+; Pass 1 scales the coefficients by pw_2896x8 (rows 0 and 1 are also shuffled
+; with q1032), calls iadst_4x8_internal.main_pass1 and negates one interleaved
+; half (0 - m2) while rearranging the data for pass 2.
+; The labels after .pass2 are shared epilogues that other 8x4 functions enter:
+;   .end  - vpermq q3120 on m0/m1, falls through to .end2
+;   .end2 - pmulhrsw of m0/m1 by pw_2048 (a rounded >>4 if the constant holds
+;           2048 in every word), falls through to .end3
+;   .end3 - zeroes 2*32 bytes at cq and adds m0/m1 to four rows of dst
+;           through WRITE_8X4
+cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd xm0, [o(pw_2896x8)]
+    pshufd xm4, [cq+16*0], q1032
+    pmulhrsw xm3, xm0, [cq+16*3]
+    pshufd xm5, [cq+16*1], q1032
+    pmulhrsw xm2, xm0, [cq+16*2]
+    pmulhrsw xm4, xm0
+    pmulhrsw xm5, xm0
+    call m(iadst_4x8_internal).main_pass1
+    vinserti128 m0, xm2, 1
+    vinserti128 m1, xm3, 1
+    punpckhwd m2, m0, m1
+    punpcklwd m0, m1
+    pxor m3, m3
+    psubsw m3, m2
+    punpckhwd m1, m0, m3
+    punpcklwd m0, m3
+    jmp tx2q
+.pass2:
+    call .main
+.end:
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q3120
+.end2:
+    vpbroadcastd m2, [o(pw_2048)]
+    pmulhrsw m0, m2
+    pmulhrsw m1, m2
+    WIN64_RESTORE_XMM
+.end3:
+    pxor m2, m2
+    mova [cq+32*0], m2
+    mova [cq+32*1], m2
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 4, 5
+    RET
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+; iflipadst_8x4_internal
+; Uses the same pass-1 kernel as iadst_8x4_internal but consumes its outputs
+; in the opposite register order (m3/m2 first). .pass2 runs
+; iadst_8x4_internal.main and exchanges m0/m1 while permuting, presumably to
+; realise the flip, before joining iadst_8x4_internal.end2.
+cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd xm0, [o(pw_2896x8)]
+    pshufd xm4, [cq+16*0], q1032
+    pmulhrsw xm3, xm0, [cq+16*3]
+    pshufd xm5, [cq+16*1], q1032
+    pmulhrsw xm2, xm0, [cq+16*2]
+    pmulhrsw xm4, xm0
+    pmulhrsw xm5, xm0
+    call m(iadst_4x8_internal).main_pass1
+    vinserti128 m3, xm1, 1
+    vinserti128 m2, xm0, 1
+    punpckhwd m1, m3, m2
+    punpcklwd m3, m2
+    pxor m0, m0
+    psubsw m0, m1
+    punpckhwd m1, m0, m3
+    punpcklwd m0, m3
+    jmp tx2q
+.pass2:
+    call m(iadst_8x4_internal).main
+    mova m2, m1
+    vpermq m1, m0, q2031
+    vpermq m0, m2, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+; iidentity_8x4_internal
+; Pass 1 loads and interleaves the rows, scales by pw_2896x8 and doubles the
+; result with signed saturation (paddsw x, x). .pass2 computes
+; x + pmulhrsw(x, pw_1697x8) and joins iadst_8x4_internal.end.
+cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    mova xm2, [cq+16*0]
+    mova xm0, [cq+16*1]
+    vinserti128 m2, [cq+16*2], 1
+    vinserti128 m0, [cq+16*3], 1
+    vpbroadcastd m3, [o(pw_2896x8)]
+    punpcklwd m1, m2, m0
+    punpckhwd m2, m0
+    pmulhrsw m1, m3
+    pmulhrsw m2, m3
+    punpcklwd m0, m1, m2
+    punpckhwd m1, m2
+    paddsw m0, m0
+    paddsw m1, m1
+    jmp tx2q
+.pass2:
+    vpbroadcastd m3, [o(pw_1697x8)]
+    pmulhrsw m2, m3, m0
+    pmulhrsw m3, m1
+    paddsw m0, m2
+    paddsw m1, m3
+    jmp m(iadst_8x4_internal).end
+
+; INV_TXFM_8X8_FN type1, type2
+; Instantiates the 8x8 entry point through INV_TXFM_FN. For dct_dct it also
+; emits an inline path that derives a single value from the first coefficient
+; [cq], broadcasts it to every word of m0 and adds it to dst four rows per
+; iteration with WRITE_8X4.
+;   .end  - sets r2d = 2 (two iterations, 8 rows)
+;   .end2 - entered with r2d already set by the caller (INV_TXFM_8X16_FN
+;           jumps here with r2d = 4)
+; NOTE(review): the condition under which this path runs lives in INV_TXFM_FN,
+; which is not visible here; "mov [cq], eobd" presumably clears the DC
+; coefficient because eob is 0 on that path - confirm.
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_16384)]
+    mov [cq], eobd
+    pmulhrsw xm0, xm2
+    psrlw xm2, 3 ; pw_2048
+    pmulhrsw xm0, xm1
+    pmulhrsw xm0, xm2
+    vpbroadcastw m0, xm0
+.end:
+    mov r2d, 2
+.end2:
+    lea r3, [strideq*3]
+.loop:
+    WRITE_8X4 0, 0, 1, 2
+    lea dstq, [dstq+strideq*4]
+    dec r2d
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+; idct_8x8_internal
+; Both passes run .main (IDCT8_1D_PACKED). Pass 1 loads the four 32-byte rows
+; with a q3120 qword permute, reshuffles the kernel output for pass 2 and
+; applies pmulhrsw by pw_16384 (a rounded >>1 if the constant holds 16384).
+; .pass2 leaves pw_2048 in m4 as the multiplier expected by the shared
+; epilogue at iadst_8x8_internal.end2.
+cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq m0, [cq+32*0], q3120 ; 0 1
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    vpermq m1, [cq+32*1], q3120 ; 2 3
+    call .main
+    shufps m4, m0, m1, q0220
+    shufps m5, m0, m1, q1331
+    shufps m1, m2, m3, q0220
+    shufps m3, m2, m3, q1331
+    vbroadcasti128 m0, [o(deint_shuf)]
+    vpbroadcastd m2, [o(pw_16384)]
+    REPX {pshufb x, m0}, m4, m5, m1, m3
+    REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+    vinserti128 m0, m4, xm1, 1
+    vperm2i128 m2, m4, m1, 0x31
+    vinserti128 m1, m5, xm3, 1
+    vperm2i128 m3, m5, m3, 0x31
+    jmp tx2q
+.pass2:
+    call .main
+    vpbroadcastd m4, [o(pw_2048)]
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q2031
+    vpermq m2, m2, q3120
+    vpermq m3, m3, q2031
+    jmp m(iadst_8x8_internal).end2
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+; iadst_8x8_internal
+; Pass 1 uses .main_pass1 (IADST8_1D_PACKED 1), pass 2 uses .main_pass2
+; (IADST8_1D_PACKED 2). Sign changes are folded into the rounding multiplies
+; by multiplying with a negated constant. Shared epilogue labels, each falling
+; through to the next:
+;   .end  - vpermq q3120 on m0-m3
+;   .end2 - pmulhrsw of m0/m1 by m4
+;   .end3 - pmulhrsw of m2/m3 by m4
+;   .end4 - zeroes 4*32 bytes at cq, then two WRITE_8X4 invocations (8 rows)
+; Callers that jump into .end/.end2/.end3 must have the multiplier in m4.
+cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq m4, [cq+32*0], q1302 ; 1 0
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m5, [cq+32*1], q1302 ; 3 2
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    call .main_pass1
+    vpbroadcastd m5, [o(pw_16384)]
+    punpcklwd m4, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m1, m2, m3
+    punpckhwd m2, m3
+    pxor m3, m3
+    psubw m3, m5 ; negate odd elements during rounding
+    pmulhrsw m4, m5
+    pmulhrsw m0, m3
+    pmulhrsw m1, m5
+    pmulhrsw m2, m3
+    punpcklwd m3, m4, m0
+    punpckhwd m4, m0
+    punpcklwd m0, m1, m2
+    punpckhwd m1, m2
+    vperm2i128 m2, m3, m0, 0x31
+    vinserti128 m0, m3, xm0, 1
+    vperm2i128 m3, m4, m1, 0x31
+    vinserti128 m1, m4, xm1, 1
+    jmp tx2q
+.pass2:
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call .main_pass2
+    vpbroadcastd m5, [o(pw_2048)]
+    vpbroadcastd xm4, [o(pw_4096)]
+    psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+    pmulhrsw m0, m4
+    pmulhrsw m1, m4
+.end3:
+    pmulhrsw m2, m4
+    pmulhrsw m3, m4
+    WIN64_RESTORE_XMM
+.end4:
+    pxor m4, m4
+    mova [cq+32*0], m4
+    mova [cq+32*1], m4
+    mova [cq+32*2], m4
+    mova [cq+32*3], m4
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 4, 5
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 4, 5
+    RET
+ALIGN function_align
+.main_pass1:
+    IADST8_1D_PACKED 1
+    ret
+ALIGN function_align
+.main_pass2:
+    IADST8_1D_PACKED 2
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+; iflipadst_8x8_internal
+; Reuses both iadst_8x8 kernels but reads their outputs in reverse register
+; order. .pass2 multiplies m0/m1 itself (after moving the permuted values
+; between registers) and therefore enters the shared epilogue at .end3, which
+; only scales m2/m3.
+cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq m4, [cq+32*0], q1302 ; 1 0
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m5, [cq+32*1], q1302 ; 3 2
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    call m(iadst_8x8_internal).main_pass1
+    vpbroadcastd m5, [o(pw_16384)]
+    punpckhwd m4, m3, m2
+    punpcklwd m3, m2
+    punpckhwd m2, m1, m0
+    punpcklwd m1, m0
+    pxor m0, m0
+    psubw m0, m5
+    pmulhrsw m4, m0
+    pmulhrsw m3, m5
+    pmulhrsw m2, m0
+    pmulhrsw m1, m5
+    punpckhwd m0, m4, m3
+    punpcklwd m4, m3
+    punpckhwd m3, m2, m1
+    punpcklwd m2, m1
+    vinserti128 m1, m0, xm3, 1
+    vperm2i128 m3, m0, m3, 0x31
+    vinserti128 m0, m4, xm2, 1
+    vperm2i128 m2, m4, m2, 0x31
+    jmp tx2q
+.pass2:
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call m(iadst_8x8_internal).main_pass2
+    vpbroadcastd m4, [o(pw_2048)]
+    vpbroadcastd xm5, [o(pw_4096)]
+    psubw m4, m5 ; lower half = -2048, upper half = 2048
+    vpermq m5, m3, q2031
+    vpermq m3, m0, q2031
+    vpermq m0, m2, q2031
+    vpermq m2, m1, q2031
+    pmulhrsw m1, m0, m4
+    pmulhrsw m0, m5, m4
+    jmp m(iadst_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+; iidentity_8x8_internal
+; Pass 1 is a pure load-and-interleave with no arithmetic. .pass2 only loads
+; pw_4096 into m4 as the multiplier for iadst_8x8_internal.end.
+cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    mova xm3, [cq+16*0]
+    mova xm2, [cq+16*1]
+    vinserti128 m3, [cq+16*4], 1
+    vinserti128 m2, [cq+16*5], 1
+    mova xm4, [cq+16*2]
+    mova xm0, [cq+16*3]
+    vinserti128 m4, [cq+16*6], 1
+    vinserti128 m0, [cq+16*7], 1
+    punpcklwd m1, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    punpckldq m0, m1, m2
+    punpckhdq m1, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    jmp tx2q
+.pass2:
+    vpbroadcastd m4, [o(pw_4096)]
+    jmp m(iadst_8x8_internal).end
+
+; INV_TXFM_8X16_FN type1, type2
+; Same structure as INV_TXFM_8X8_FN, except that the dct_dct path applies one
+; additional pw_2896x8 multiply and reuses the 8x8 store loop with r2d = 4
+; (four WRITE_8X4 iterations, 16 rows).
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_16384)]
+    mov [cq], eobd
+    pmulhrsw xm0, xm1
+    pmulhrsw xm0, xm2
+    psrlw xm2, 3 ; pw_2048
+    pmulhrsw xm0, xm1
+    pmulhrsw xm0, xm2
+    vpbroadcastw m0, xm0
+    mov r2d, 4
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+%endif
+%endmacro
+
+; ITX_8X16_LOAD_COEFS
+; Loads eight consecutive 32-byte coefficient rows into m0-m7, each multiplied
+; by pw_2896x8 (m4 holds the constant until its own row is loaded last).
+; cq is advanced by 32*4 and left there, so code that runs afterwards
+; addresses the same buffer as [cq-32*4] .. [cq+32*3].
+%macro ITX_8X16_LOAD_COEFS 0
+    vpbroadcastd m4, [o(pw_2896x8)]
+    pmulhrsw m0, m4, [cq+32*0]
+    add cq, 32*4
+    pmulhrsw m7, m4, [cq+32*3]
+    pmulhrsw m1, m4, [cq-32*3]
+    pmulhrsw m6, m4, [cq+32*2]
+    pmulhrsw m2, m4, [cq-32*2]
+    pmulhrsw m5, m4, [cq+32*1]
+    pmulhrsw m3, m4, [cq-32*1]
+    pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+; idct_8x16_internal
+; Pass 1 runs idct_16x8_internal.main and then rearranges the eight registers
+; for pass 2, multiplying every word by m10 on the way.
+;   .pass1_end  - entered by iadst_8x16_internal with its own m10
+;   .pass1_end2 - entered by iflipadst_8x16_internal, which performs the
+;                 preceding lane merge itself
+; .pass2 runs .main (IDCT16_1D_PACKED). Shared epilogue labels:
+;   .end  - loads pw_2048 into m8
+;   .end2 - pmulhrsw of m0-m7 by m8
+;   .end3 - zeroes the eight coefficient rows around the advanced cq and
+;           writes 16 rows of dst with four WRITE_8X4 invocations
+cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(idct_16x8_internal).main
+    vpbroadcastd m10, [o(pw_16384)]
+.pass1_end:
+    vperm2i128 m9, m3, m7, 0x31
+    vinserti128 m3, xm7, 1
+    vperm2i128 m8, m2, m6, 0x31
+    vinserti128 m2, xm6, 1
+    vperm2i128 m6, m1, m5, 0x31
+    vinserti128 m1, xm5, 1
+    vperm2i128 m5, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    punpckhwd m4, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m0, m1
+    punpcklwd m0, m1
+.pass1_end2:
+    punpckhwd m7, m5, m6
+    punpcklwd m5, m6
+    punpcklwd m6, m8, m9
+    punpckhwd m8, m9
+    REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    punpckldq m4, m5, m6
+    punpckhdq m5, m6
+    punpckldq m6, m7, m8
+    punpckhdq m7, m8
+    jmp tx2q
+.pass2:
+    call .main
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    vpbroadcastd m8, [o(pw_2048)]
+.end2:
+    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 8, 9
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 0, 1
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 4, 5, 0, 1
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 6, 7, 0, 1
+    RET
+ALIGN function_align
+.main:
+    IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+; iadst_8x16_internal
+; Pass 1 reuses the iadst_16x8 kernels, builds m10 with alternating
+; 16384 / -16384 words so that the rounding multiply also applies the sign,
+; and continues at idct_8x16_internal.pass1_end.
+; .pass2 builds the 2048 / -2048 multiplier in m8 and joins
+; idct_8x16_internal.end2. Local kernels:
+;   .main           - first shuffles m7/m1/m5/m3 with q1032, then falls
+;                     through to .main2
+;   .main2          - entry that skips that shuffle (called by the 16x8 ADST
+;                     functions); expects nothing in m10, loads pd_2048 itself
+;   .main_pass1_end - completes the remaining outputs with pmaddwd, adds the
+;                     pd_2048 still held in m10, shifts right by 12 and packs;
+;                     on return m10 = pw_16384 and m9 = 0
+;   .main_pass2_end - completes them with saturating 16-bit adds/subs and a
+;                     pmulhrsw by pw_2896x8
+cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+    vpbroadcastd m10, [o(pw_16384)]
+    pslld m9, m10, 17
+    psubw m10, m9 ; 16384, -16384
+    jmp m(idct_8x16_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    vpbroadcastd m9, [o(pw_2048)]
+    vpbroadcastd xm8, [o(pw_4096)]
+    psubw m8, m9
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    jmp m(idct_8x16_internal).end2
+ALIGN function_align
+.main:
+    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+    vpbroadcastd m10, [o(pd_2048)]
+    punpckhwd m8, m7, m0 ; in14 in1
+    punpcklwd m0, m7 ; in0 in15
+    punpcklwd m7, m6, m1 ; in12 in3
+    punpckhwd m1, m6 ; in2 in13
+    punpckhwd m6, m5, m2 ; in10 in5
+    punpcklwd m2, m5 ; in4 in11
+    punpcklwd m5, m4, m3 ; in8 in7
+    punpckhwd m3, m4 ; in6 in9
+    ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1
+    ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3
+    ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5
+    ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7
+    ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9
+    ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+    ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+    ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
+    psubsw m4, m0, m5 ; t9a t8a
+    paddsw m0, m5 ; t1a t0a
+    psubsw m5, m1, m6 ; t11a t10a
+    paddsw m1, m6 ; t3a t2a
+    psubsw m6, m2, m7 ; t13a t12a
+    paddsw m2, m7 ; t5a t4a
+    psubsw m7, m3, m8 ; t15a t14a
+    paddsw m3, m8 ; t7a t6a
+    vpbroadcastd m11, [o(pw_m4017_799)]
+    vpbroadcastd m12, [o(pw_799_4017)]
+    pxor m9, m9
+    ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
+    psubw m8, m9, m11 ; pw_4017_m799
+    ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+    vpbroadcastd m11, [o(pw_m2276_3406)]
+    vpbroadcastd m12, [o(pw_3406_2276)]
+    ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+    psubw m8, m9, m11 ; pw_2276_m3406
+    ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+    psubsw m8, m1, m3 ; t7 t6
+    paddsw m1, m3 ; t3 t2
+    psubsw m3, m0, m2 ; t5 t4
+    paddsw m0, m2 ; t1 t0
+    psubsw m2, m5, m7 ; t14a t15a
+    paddsw m7, m5 ; t10a t11a
+    psubsw m5, m4, m6 ; t12a t13a
+    paddsw m4, m6 ; t8a t9a
+    vpbroadcastd m11, [o(pw_m3784_1567)]
+    vpbroadcastd m12, [o(pw_1567_3784)]
+    ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+    psubw m6, m9, m11 ; pw_3784_m1567
+    ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
+    vpbroadcastd m11, [o(pw_m1567_3784)]
+    vpbroadcastd m12, [o(pw_3784_1567)]
+    ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+    psubw m6, m9, m11 ; pw_1567_m3784
+    ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+    vbroadcasti128 m12, [o(deint_shuf)]
+    paddsw m6, m4, m7 ; -out1 out14
+    psubsw m4, m7 ; t10 t11
+    psubsw m11, m3, m8 ; t7 t6
+    paddsw m8, m3 ; out12 -out3
+    psubsw m3, m0, m1 ; t3a t2a
+    paddsw m0, m1 ; -out15 out0
+    paddsw m1, m2, m5 ; -out13 out2
+    psubsw m5, m2 ; t15a t14a
+    pshufb m0, m12
+    pshufb m6, m12
+    pshufb m8, m12
+    pshufb m1, m12
+    shufps m7, m6, m0, q1032 ; out14 -out15
+    vpblendd m0, m6, 0x33 ; -out1 out0
+    punpcklqdq m6, m8, m1 ; out12 -out13
+    punpckhqdq m1, m8, m1 ; -out3 out2
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd m8, [o(pw_m2896_2896)]
+    vpbroadcastd m12, [o(pw_2896_2896)]
+    pmaddwd m9, m8, m11 ; -out11
+    pmaddwd m2, m12, m5 ; -out5
+    pmaddwd m5, m8 ; out10
+    pmaddwd m11, m12 ; out4
+    REPX {paddd x, m10}, m9, m5, m2, m11
+    REPX {psrad x, 12 }, m9, m5, m2, m11
+    packssdw m5, m9 ; out10 -out11
+    packssdw m2, m11 ; -out5 out4
+    pmaddwd m11, m8, m3 ; out8
+    vpbroadcastd m8, [o(pw_2896_m2896)]
+    pmaddwd m3, m12 ; -out7
+    pmaddwd m8, m4 ; -out9
+    pmaddwd m4, m12 ; out6
+    REPX {paddd x, m10}, m11, m3, m8, m4
+    REPX {psrad x, 12 }, m11, m3, m8, m4
+    packssdw m3, m4 ; -out7 out6
+    packssdw m4, m11, m8 ; out8 -out9
+    vpbroadcastd m10, [o(pw_16384)]
+    pxor m9, m9
+    ret
+ALIGN function_align
+.main_pass2_end:
+    vpbroadcastd m8, [o(pw_2896x8)]
+    pshufb m2, m11, m12
+    pshufb m5, m12
+    pshufb m3, m12
+    pshufb m4, m12
+    punpcklqdq m11, m5, m2 ; t15a t7
+    punpckhqdq m5, m2 ; t14a t6
+    shufps m2, m3, m4, q1032 ; t2a t10
+    vpblendd m3, m4, 0xcc ; t3a t11
+    psubsw m4, m2, m3 ; out8 -out9
+    paddsw m3, m2 ; -out7 out6
+    paddsw m2, m5, m11 ; -out5 out4
+    psubsw m5, m11 ; out10 -out11
+    REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+; iflipadst_8x16_internal
+; Pass 1 uses the same iadst_16x8 kernels as iadst_8x16_internal but with the
+; opposite sign pattern in m10 and its own lane merge, which reads the kernel
+; outputs in reverse order, before joining idct_8x16_internal.pass1_end2.
+; .pass2 reuses the iadst_8x16 kernels, moves the permuted registers into
+; reverse order, multiplies all eight by m8 itself and therefore enters the
+; shared epilogue at idct_8x16_internal.end3 (clear and store only).
+cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+    vpbroadcastd m9, [o(pw_16384)]
+    pslld m10, m9, 17
+    psubw m10, m9 ; -16384, 16384
+    vperm2i128 m9, m4, m0, 0x31
+    vinserti128 m0, m4, xm0, 1
+    vperm2i128 m8, m5, m1, 0x31
+    vinserti128 m4, m5, xm1, 1
+    vperm2i128 m5, m7, m3, 0x31
+    vinserti128 m3, m7, xm3, 1
+    vinserti128 m1, m6, xm2, 1
+    vperm2i128 m6, m6, m2, 0x31
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    punpcklwd m0, m3, m1
+    punpckhwd m3, m1
+    jmp m(idct_8x16_internal).pass1_end2
+.pass2:
+    call m(iadst_8x16_internal).main
+    call m(iadst_8x16_internal).main_pass2_end
+    vpbroadcastd m8, [o(pw_2048)]
+    vpbroadcastd xm9, [o(pw_4096)]
+    psubw m8, m9
+    vpermq m9, m0, q3120
+    vpermq m0, m7, q2031
+    vpermq m7, m1, q3120
+    vpermq m1, m6, q2031
+    vpermq m6, m2, q3120
+    vpermq m2, m5, q2031
+    vpermq m5, m3, q3120
+    vpermq m3, m4, q2031
+    pmulhrsw m0, m8
+    pmulhrsw m1, m8
+    pmulhrsw m2, m8
+    pmulhrsw m3, m8
+    pmulhrsw m4, m5, m8
+    pmulhrsw m5, m6, m8
+    pmulhrsw m6, m7, m8
+    pmulhrsw m7, m9, m8
+    jmp m(idct_8x16_internal).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+; IDTX16 src/dst, tmp, mul [, half]
+; Computes t = pmulhrsw(x, mul) into m%2, where callers pass a register
+; holding pw_1697x16 as mul.
+;   4 operands: t is multiplied again by m%4 (callers pass pw_16384, i.e. a
+;               rounded halving) and the result is x + t
+;   3 operands: the result is 2*x + t
+; All additions use signed saturation (paddsw). m%2 is clobbered.
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+    pmulhrsw m%2, m%4
+%else
+    paddsw m%1, m%1
+%endif
+    paddsw m%1, m%2
+%endmacro
+ +cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*2] + add cq, 16*8 + vinserti128 m3, [cq+16*0], 1 + vinserti128 m2, [cq+16*2], 1 + vpbroadcastd m9, [o(pw_2896x8)] + mova xm4, [cq-16*4] + mova xm5, [cq-16*2] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*6], 1 + mova xm7, [cq-16*7] + mova xm6, [cq-16*5] + vinserti128 m7, [cq+16*1], 1 + vinserti128 m6, [cq+16*3], 1 + mova xm8, [cq-16*3] + mova xm0, [cq-16*1] + vinserti128 m8, [cq+16*5], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + punpcklwd m5, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m8, m0 + punpckhwd m8, m0 + REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(idct_8x16_internal).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti128 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + mov r2d, 2 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova xm1, [dstq] + vinserti128 m1, [dstq+strideq], 1 + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova 
[dstq], xm1 + vextracti128 [dstq+strideq], m1, 1 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal).main + vinserti128 m6, m2, xm6, 1 + vinserti128 m2, m0, xm4, 1 + vinserti128 m0, m1, xm5, 1 + vinserti128 m1, m3, xm7, 1 + punpcklwd m3, m2, m6 + punpckhwd m2, m6 + vpbroadcastd m6, [o(pw_16384)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + mova m1, m6 + jmp m(iadst_16x4_internal).pass1_end +.pass2: + call .main + jmp m(iadst_16x4_internal).end +ALIGN function_align +.main: + vpbroadcastd m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal).main2 + call m(iadst_4x16_internal).main_pass1_end + punpcklwd m4, m3, m1 + punpcklwd m5, m2, m0 + punpckhwd m0, m1 + punpckhwd m2, m3 + vpbroadcastd m1, [o(pw_16384)] + vinserti128 m3, m0, xm2, 1 + vperm2i128 m2, m0, m2, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m6, m7, m1 +.pass1_end: + pmulhrsw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m4, m1 + pmulhrsw m0, m6 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + WIN64_RESTORE_XMM 
+.end2: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 + RET +ALIGN function_align +.main: + vpbroadcastd m6, [o(pw_m3344_3344)] + vpbroadcastd m7, [o(pw_3803_1321)] + vpbroadcastd m8, [o(pw_m1321_2482)] + vpbroadcastd m9, [o(pw_2482_3344)] + punpcklwd m4, m2, m0 ; in2 in0 l + punpckhwd m2, m0 ; in2 in0 h + psrld m5, m6, 16 + pmaddwd m10, m6, m4 ; t2:02 l + pmaddwd m6, m2 ; t2:02 h + pmaddwd m0, m7, m4 ; t0:02 l + pmaddwd m7, m2 ; t0:02 h + pmaddwd m4, m8 ; t1:02 l + pmaddwd m8, m2 ; t1:02 h + punpckhwd m2, m3, m1 ; in3 in1 h + punpcklwd m3, m1 ; in3 in1 l + pmaddwd m1, m5, m2 ; t2:3 h + pmaddwd m5, m3 ; t2:3 l + paddd m6, m1 + vpbroadcastd m1, [o(pd_2048)] + paddd m10, m5 + pmaddwd m5, m9, m3 + pmaddwd m9, m2 + paddd m0, m1 + paddd m7, m1 + paddd m0, m5 ; t0 + t3 + 2048 l + paddd m7, m9 ; t0 + t3 + 2048 h + vpbroadcastd m9, [o(pw_m3803_3344)] + pmaddwd m5, m9, m2 + pmaddwd m9, m3 + paddd m10, m1 ; t2 + 2048 l + paddd m6, m1 ; t2 + 2048 h + paddd m5, m1 ; t1:13 + 2048 h + paddd m1, m9 ; t1:13 + 2048 l + vpbroadcastd m9, [o(pw_m3803_m6688)] + pmaddwd m2, m9 + pmaddwd m3, m9 + paddd m5, m8 ; t1 + t3 + 2048 h + paddd m1, m4 ; t1 + t3 + 2048 l + paddd m8, m7 + paddd m4, m0 + paddd m2, m8 ; t0 + t1 - t3 + 2048 h + paddd m3, m4 ; t0 + t1 - t3 + 2048 l + REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 + packssdw m0, m7 + packssdw m1, m5 + packssdw m3, m2 + packssdw m2, m10, m6 + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal).main2 + call m(iadst_4x16_internal).main_pass1_end + 
punpckhwd m4, m3, m2 + punpckhwd m5, m1, m0 + punpcklwd m0, m2 + punpcklwd m1, m3 + vpbroadcastd m6, [o(pw_16384)] + vinserti128 m3, m0, xm1, 1 + vperm2i128 m2, m0, m1, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m1, m7, m6 + jmp m(iadst_16x4_internal).pass1_end +ALIGN function_align +.pass2: + call m(iadst_16x4_internal).main + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m3, m2, m1, m0 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 + RET + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m7, [o(pw_1697x16)] + vpbroadcastd m8, [o(pw_16384)] + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + pmulhrsw m0, m7, m1 + pmulhrsw m5, m7, m2 + pmulhrsw m6, m7, m3 + pmulhrsw m7, m4 + REPX {pmulhrsw x, m8}, m0, m5, m6, m7 + paddsw m1, m0 + paddsw m2, m5 + paddsw m3, m6 + paddsw m4, m7 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_16x4_internal).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + 
mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 4 + jmp m(inv_txfm_add_dct_dct_16x4).dconly +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 3120 + call m(idct_8x16_internal).main + vpbroadcastd m10, [o(pw_16384)] + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + REPX {pmulhrsw x, m10}, m8, m1, m4, m6 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m2, m9, m5 + punpckhwd m3, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m9, m4 + punpckhwd m9, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m8 + punpckhdq m3, m8 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m9, m5 + punpckhdq m9, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m8, 0x31 + vinserti128 m2, xm8, 1 + vperm2i128 m7, m3, m9, 0x31 + vinserti128 m3, xm9, 1 + jmp tx2q +.pass2: + call .main + vpbroadcastd m8, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m8}, m0, m2, m4, m6 +.end2: + REPX {pmulhrsw x, m8}, m1, m3, m5, m7 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 +.end3: + pxor m0, m0 + REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 +.end4: + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 
5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + RET +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] +.main2: + IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal).main2 + call m(iadst_8x16_internal).main_pass1_end + psubw m11, m9, m10 + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpckhwd m6, m5, m7 + punpcklwd m5, m7 + REPX {pmulhrsw x, m11}, m8, m1, m4, m6 + jmp m(idct_16x8_internal).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + pxor m8, m8 + psubw m8, m9 + REPX {pmulhrsw x, m9}, m0, m2, m4, m6 + jmp m(idct_16x8_internal).end2 +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a + ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a + ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a + psubsw m8, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m0, m4 ; t4 + paddsw m0, m4 ; t0 + psubsw m4, m5, m1 ; t7 + paddsw m5, m1 ; t3 + psubsw m1, m7, m3 ; t5 + paddsw m7, m3 ; t1 + ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a + psubsw m9, m6, m8 ; t7 + paddsw m6, m8 ; out6 + psubsw m3, m7, m5 ; t3 + paddsw m7, m5 ; -out7 + psubsw m5, m0, m2 ; t2 + paddsw m0, m2 ; out0 + psubsw m2, m1, m4 ; t6 + paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 
12 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 12 }, m2, m12, m5, m11 + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +.main_pass2_end: + vpbroadcastd m8, [o(pw_2896x8)] + psubsw m4, m5, m3 + paddsw m3, m5 + psubsw m5, m2, m9 + paddsw m2, m9 + pmulhrsw m2, m8 ; out2 + pmulhrsw m3, m8 ; -out3 + pmulhrsw m4, m8 ; out4 + pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(pw_2048)] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal).main2 + call m(iadst_8x16_internal).main_pass1_end + psubw m9, m10 + punpcklwd m8, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m7, m5 + punpckhwd m7, m5 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m2, m0 + punpcklwd m2, m0 + REPX {pmulhrsw x, m10}, m8, m4, m5, m1 + REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 + punpcklwd m0, m7, m4 + punpckhwd m7, m4 + punpckhwd m4, m6, m8 + punpcklwd m6, m8 + punpckhwd m8, m3, m5 + punpcklwd m3, m5 + punpcklwd m5, m2, m1 + punpckhwd m2, m1 + punpckhdq m1, m0, m6 + punpckldq m0, m6 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckhdq m4, m3, m5 + punpckldq m3, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m4, 0x31 + vinserti128 m1, xm4, 1 + vperm2i128 m4, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vinserti128 m3, m7, xm8, 1 + vperm2i128 m7, m8, 0x31 + jmp tx2q +.pass2: + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end + pxor m8, m8 + psubw m8, m9 + pmulhrsw m10, m7, m8 + pmulhrsw m7, m0, m9 + pmulhrsw m0, m6, m9 + pmulhrsw m6, m1, m8 + pmulhrsw m1, m5, 
m8 + pmulhrsw m5, m2, m9 + pmulhrsw m2, m4, m9 + pmulhrsw m4, m3, m8 + lea r3, [strideq*3] + WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 + WRITE_16X2 1, 2, 0, 1, strideq*2, r3 + jmp m(idct_16x8_internal).end3 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] + add cq, 16*8 + vpbroadcastd m3, [o(pw_2896x8)] + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_16384)] + REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 + punpcklwd m3, m7, m2 + punpckhwd m7, m2 + punpcklwd m2, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m8, m5 + punpckhwd m8, m5 + punpcklwd m5, m0, m1 + punpckhwd m0, m1 + punpckldq m1, m3, m2 + punpckhdq m3, m2 + punpckldq m2, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m7, m6 + punpckhdq m7, m6 + punpckldq m6, m8, m0 + punpckhdq m8, m0 + REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m6 + punpckhqdq m5, m6 + punpcklqdq m6, m7, m8 + punpckhqdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_4096)] + jmp m(idct_16x8_internal).end + +%define o_base pw_5 + 128 + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 8 + jmp m(inv_txfm_add_dct_dct_16x4).dconly +%endif +%endmacro + +%macro ITX_16X16_LOAD_COEFS 0 + 
mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + add cq, 32*8 + mova m4, [cq-32*4] + mova m5, [cq-32*3] + mova m6, [cq-32*2] + mova m7, [cq-32*1] + mova m8, [cq+32*0] + mova m9, [cq+32*1] + mova m10, [cq+32*2] + mova m11, [cq+32*3] + mova m12, [cq+32*4] + mova m13, [cq+32*5] + mova m14, [cq+32*6] + mova m15, [cq+32*7] + mova [rsp], m15 +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main +.pass1_end: + vpbroadcastd m1, [o(pw_8192)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 +.pass1_end2: + vextracti128 [rsp+16*4], m0, 1 + mova [rsp+16*0], xm0 + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + vperm2i128 m8, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vperm2i128 m9, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vperm2i128 m10, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + vperm2i128 m11, m4, m12, 0x31 + vinserti128 m4, xm12, 1 + vperm2i128 m12, m5, m13, 0x31 + vinserti128 m5, xm13, 1 + vperm2i128 m13, m6, m14, 0x31 + vinserti128 m6, xm14, 1 + vperm2i128 m14, m7, m15, 0x31 + vinserti128 m7, xm15, 1 + mova m15, [rsp+32*2] +.pass1_end3: + punpcklwd m0, m9, m10 + punpckhwd m9, m10 + punpcklwd m10, m15, m8 + punpckhwd m15, m8 + punpckhwd m8, m11, m12 + punpcklwd m11, m12 + punpckhwd m12, m13, m14 + punpcklwd m13, m14 + punpckhdq m14, m11, m13 + punpckldq m11, m13 + punpckldq m13, m15, m9 + punpckhdq m15, m9 + punpckldq m9, m10, m0 + punpckhdq m10, m0 + punpckhdq m0, m8, m12 + punpckldq m8, m12 + punpcklqdq m12, m13, m8 + punpckhqdq m13, m8 + punpcklqdq m8, m9, m11 + punpckhqdq m9, m11 + punpckhqdq m11, m10, m14 + punpcklqdq m10, m14 + punpcklqdq m14, m15, m0 + punpckhqdq m15, m0 + mova m0, [rsp] + mova [rsp], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + 
punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m1, [o(pw_2048)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp], m6 +.end2: + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + lea r3, [strideq*3] + WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 +.end3: + pxor m2, m2 + REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 + lea dstq, [dstq+strideq*4] + WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 + WRITE_16X2 10, 11, 0, 1, strideq*2, r3 + REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 + lea dstq, [dstq+strideq*4] + WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 + WRITE_16X2 14, 15, 0, 1, strideq*2, r3 + RET +ALIGN function_align +.main: + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], m1 + mova [rsp+gprsize+32*2], m9 + IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 + mova m1, [rsp+gprsize+32*2] ; in9 + mova [rsp+gprsize+32*2], m14 ; tmp7 + mova m9, [rsp+gprsize+32*1] ; in1 + mova [rsp+gprsize+32*1], m10 ; tmp5 + mova m14, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m6 ; tmp3 + IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 + mova m6, [rsp+gprsize+32*1] ; tmp5 + psubsw m15, m0, m14 ; out15 + paddsw m0, m14 ; out0 + psubsw m14, m2, m13 ; out14 + paddsw m2, m13 ; out1 + mova [rsp+gprsize+32*1], m2 + psubsw m13, m4, m11 ; out13 + paddsw m2, m4, m11 ; out2 + psubsw 
m11, m8, m7 ; out11 + paddsw m4, m8, m7 ; out4 + mova m7, [rsp+gprsize+32*2] ; tmp7 + psubsw m10, m6, m5 ; out10 + paddsw m5, m6 ; out5 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; out7 + psubsw m9, m12, m3 ; out9 + paddsw m6, m12, m3 ; out6 + mova m3, [rsp+gprsize+32*0] ; tmp3 + psubsw m12, m3, m1 ; out12 + paddsw m3, m1 ; out3 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 + pxor m8, m8 + psubw m1, m8, m1 + jmp m(idct_16x16_internal).pass1_end2 +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp+32*0], m6 + pxor m6, m6 + psubw m1, m6, m1 + jmp m(idct_16x16_internal).end2 +ALIGN function_align +.main: + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*2], m4 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 + psubsw m0, m2, m10 ; t10a + paddsw m2, m10 ; t2a + psubsw m10, m13, m5 ; t11a + paddsw m13, m5 ; t3a + psubsw m5, m6, m14 ; t14a + paddsw m6, m14 ; t6a + psubsw m14, m9, m1 ; t15a + paddsw m9, m1 ; t7a + ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 + ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 + psubsw m1, m10, m14 ; t14a + paddsw m10, m14 ; t10a + psubsw m14, m0, m5 ; t15a + paddsw m0, m5 ; t11a + psubsw m5, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m13, m9 ; t7 + paddsw m13, m9 ; t3 + ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a + ITX_MULSUB_2W 14, 1, 4, 9, 
15, 3784, 1567 ; t14, t15 + mova m9, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m10 ; t10a + mova m4, [rsp+gprsize+32*1] ; in0 + mova [rsp+gprsize+32*1], m6 ; t6a + mova m6, [rsp+gprsize+32*2] ; in4 + mova [rsp+gprsize+32*2], m2 ; t2 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 + psubsw m10, m4, m8 ; t8a + paddsw m8, m4 ; t0a + psubsw m4, m9, m7 ; t9a + paddsw m9, m7 ; t1a + psubsw m7, m6, m12 ; t12a + paddsw m6, m12 ; t4a + psubsw m12, m11, m3 ; t13a + paddsw m11, m3 ; t5a + ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 + ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 + psubsw m3, m9, m11 ; t5 + paddsw m9, m11 ; t1 + psubsw m11, m4, m12 ; t12a + paddsw m4, m12 ; t8a + paddsw m12, m8, m6 ; t0 + psubsw m8, m6 ; t4 + paddsw m6, m10, m7 ; t9a + psubsw m10, m7 ; t13a + ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 + mova m7, [rsp+gprsize+32*0] ; t10a + mova m2, [rsp+gprsize+32*1] ; t6a + paddsw m15, m9, m13 ; -out15 + psubsw m9, m13 ; t3a + paddsw m13, m11, m1 ; -out13 + psubsw m11, m1 ; t15a + psubsw m1, m4, m7 ; t10 + paddsw m7, m4 ; -out1 + psubsw m4, m3, m2 ; t6 + paddsw m3, m2 ; -out3 + paddsw m2, m10, m14 ; out2 + psubsw m10, m14 ; t14a + paddsw m14, m6, m0 ; out14 + psubsw m6, m0 ; t11 + mova m0, [rsp+gprsize+32*2] ; t2 + mova [rsp+gprsize+32*1], m7 + psubsw m7, m12, m0 ; t2a + paddsw m0, m12 ; out0 + paddsw m12, m8, m5 ; out12 + psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [pw_m2896_2896] + vpbroadcastd m12, [pw_2896_2896] + vpbroadcastd m2, [pd_2048] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd 
x, m2}, m10, m0, m5, m11 + REPX {psrad x, 12}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 12}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 12}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 12}, m1, m9, m0, m6 + packssdw m9, m1 ; -out9 + packssdw m6, m0 ; out6 + vpbroadcastd m1, [o(pw_8192)] ; left in m1 for the callers, which pmulhrsw by it right after this returns + ret +ALIGN function_align +.main_pass2_end: + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates. 
+ paddsw m5, m10, m11 ; -out5 + psubsw m10, m11 ; out10 + psubsw m11, m4, m8 ; -out11 + paddsw m4, m8 ; out4 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; -out7 + psubsw m9, m1, m6 ; -out9 + paddsw m6, m1 ; out6 + vpbroadcastd m1, [o(pw_2896x8)] + REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(pw_2048)] + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call m(iadst_16x16_internal).main + call m(iadst_16x16_internal).main_pass1_end + pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 + mova [rsp+32*2], m6 + pmulhrsw m6, m1, m4 + pmulhrsw m4, m1, m10 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] + pxor m0, m0 + psubw m0, m1 + REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 + pmulhrsw m1, m0, m9 + pmulhrsw m9, m0, m13 + pmulhrsw m0, [rsp+32*1] + mova [rsp+16*0], xm15 + mova [rsp+16*1], xm7 + vperm2i128 m15, m15, m7, 0x31 + vinserti128 m7, m2, xm14, 1 + vperm2i128 m14, m2, m14, 0x31 + vinserti128 m2, m9, xm5, 1 + vperm2i128 m9, m9, m5, 0x31 + vinserti128 m5, m4, xm12, 1 + vperm2i128 m12, m4, m12, 0x31 + vinserti128 m4, m11, xm3, 1 + vperm2i128 m11, m11, m3, 0x31 + vinserti128 m3, m10, xm6, 1 + vperm2i128 m10, m10, m6, 0x31 + vinserti128 m6, m1, xm0, 1 + vperm2i128 m13, m1, m0, 0x31 + vinserti128 m1, m8, [rsp+32*2], 1 + vperm2i128 m8, m8, [rsp+32*2], 0x31 + jmp m(idct_16x16_internal).pass1_end3 +.pass2: + call m(iadst_16x16_internal).main + call m(iadst_16x16_internal).main_pass2_end + pmulhrsw m0, m1 + pmulhrsw m8, m1 + mova [rsp+32*0], m0 + mova [rsp+32*2], m8 + pxor m0, m0 + psubw m0, m1 + pmulhrsw m8, m0, m7 + pmulhrsw m7, m0, m9 + pmulhrsw m9, m1, m6 + pmulhrsw m6, m1, m10 + pmulhrsw m10, m0, m5 + pmulhrsw m5, m0, m11 + pmulhrsw m11, m1, m4 + pmulhrsw m4, m1, m12 + pmulhrsw m12, m0, m3 + pmulhrsw m3, m0, m13 + 
pmulhrsw m13, m1, m2 + pmulhrsw m1, m14 + pmulhrsw m14, m0, [rsp+32*1] + pmulhrsw m0, m15 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 + mova m15, [rsp+32*0] + WRITE_16X2 3, 4, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 + WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 + jmp m(idct_16x16_internal).end3 + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 ; signs are guaranteed to be equal +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + vpbroadcastd m7, [o(pw_1697x16)] + mova xm0, [cq+16* 0] + vinserti128 m0, [cq+16*16], 1 + mova xm15, [cq+16* 1] + vinserti128 m15, [cq+16*17], 1 + mova xm1, [cq+16* 2] + vinserti128 m1, [cq+16*18], 1 + mova xm8, [cq+16* 3] + vinserti128 m8, [cq+16*19], 1 + mova xm2, [cq+16* 4] + vinserti128 m2, [cq+16*20], 1 + mova xm9, [cq+16* 5] + vinserti128 m9, [cq+16*21], 1 + mova xm3, [cq+16* 6] + vinserti128 m3, [cq+16*22], 1 + mova xm10, [cq+16* 7] + add cq, 16*16 + vinserti128 m10, [cq+16* 7], 1 + mova xm4, [cq-16* 8] + vinserti128 m4, [cq+16* 8], 1 + mova xm11, [cq-16* 7] + vinserti128 m11, [cq+16* 9], 1 + mova xm5, [cq-16* 6] + vinserti128 m5, [cq+16*10], 1 + mova xm12, [cq-16* 5] + vinserti128 m12, [cq+16*11], 1 + mova xm13, [cq-16* 3] + vinserti128 m13, [cq+16*13], 1 + mova xm14, [cq-16* 1] + vinserti128 m14, [cq+16*15], 1 + REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ + 10, 4, 11, 5, 12, 13, 14 + mova xm6, [cq-16* 4] + vinserti128 m6, [cq+16*12], 1 + mova [rsp], m0 + IDTX16B 6, 0, 7 + mova xm0, [cq-16* 2] + vinserti128 m0, [cq+16*14], 1 + pmulhrsw m7, m0 + psraw m7, 1 + pavgw m7, m0 + jmp m(idct_16x16_internal).pass1_end3 +ALIGN function_align +.pass2: + vpbroadcastd m15, [o(pw_1697x16)] + mova [rsp+32*1], m0 + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, 
[rsp+32*1] + mova [rsp+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [rsp+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + jmp m(idct_16x16_internal).end + +%define o_base deint_shuf + 128 + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 -- loads 8 rows (src, src plus stride, ...) into m0-m7; when is_rect2 is nonzero every row is multiplied (pmulhrsw) by pw_2896x8, which is broadcast into m15 +%if %3 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2*0] + pmulhrsw m1, m15, [%1+%2*1] + pmulhrsw m2, m15, [%1+%2*2] + pmulhrsw m3, m15, [%1+%2*3] + pmulhrsw m4, m15, [%1+%2*4] + pmulhrsw m5, m15, [%1+%2*5] + pmulhrsw m6, m15, [%1+%2*6] + pmulhrsw m7, m15, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 -- same as LOAD_8ROWS but fills m8-m15; is_rect2 == 1 broadcasts pw_2896x8 into m15 first, any other nonzero value multiplies by whatever m15 already holds; m15 is overwritten by the last row +%if %3 +%if %3 == 1 + vpbroadcastd m15, [o(pw_2896x8)] +%endif + pmulhrsw m8, m15, [%1+%2*0] + pmulhrsw m9, m15, [%1+%2*1] + pmulhrsw m10, m15, [%1+%2*2] + pmulhrsw m11, m15, [%1+%2*3] + pmulhrsw m12, m15, [%1+%2*4] + pmulhrsw m13, m15, [%1+%2*5] + pmulhrsw m14, m15, [%1+%2*6] + pmulhrsw m15, [%1+%2*7] +%else + mova m8, [%1+%2*0] + mova m9, [%1+%2*1] + mova m10, [%1+%2*2] + mova m11, [%1+%2*3] + mova m12, [%1+%2*4] + mova m13, [%1+%2*5] + mova m14, [%1+%2*6] + mova m15, [%1+%2*7] +%endif +%endmacro + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] -- r5 must hold the address of pw_201_4091x8 (constants are addressed relative to it); dst1 = punpcklwd of src with itself times pw_<coef1>_<coef2>x8, src = punpckhwd of src with itself times pw_<coef3>_<coef4>x8; tmp is clobbered + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + cmp eobd, 106 + jle .fast + LOAD_8ROWS cq+32*1, 32*2 + call m(idct_16x8_internal).main + vperm2i128 m11, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 
0x31 + vinserti128 m3, xm7, 1 + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m11, m4 + punpckhwd m11, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 + punpckhdq m5, m11, m4 + punpckldq m11, m4 + punpckldq m4, m7, m1 + punpckhdq m7, m1 + punpckhqdq m12, m6, m0 + punpcklqdq m0, m6 ; out4 + punpckhqdq m13, m7, m4 + punpcklqdq m4, m7 ; out5 + punpckhqdq m14, m3, m2 + punpcklqdq m2, m3 ; out6 + punpckhqdq m15, m5, m11 + punpcklqdq m11, m5 ; out7 + mova [rsp+32*0], m0 + mova [rsp+32*1], m4 + mova [rsp+32*2], m2 +.fast: + LOAD_8ROWS cq+32*0, 32*2 + call m(idct_16x8_internal).main + vperm2i128 m8, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vpbroadcastd m9, [o(pw_8192)] + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m8, m4 + punpcklwd m8, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m8, m5 + punpckhdq m8, m5 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + punpckhdq m4, m7, m1 + punpckldq m7, m1 + punpcklqdq m1, m7, m4 + punpckhqdq m7, m4 ; out9 + punpckhqdq m4, m2, m8 ; out10 + punpcklqdq m2, m8 + punpckhqdq m8, m3, m5 + punpcklqdq m3, m5 + punpckhqdq m5, m0, m6 ; out8 + punpcklqdq m0, m6 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 + cmp eobd, 106 + jg .full + mova [rsp+32*0], m5 + mova [rsp+32*1], m7 + mova [rsp+32*2], m4 + pmulhrsw m11, m9, m8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call .main_fast + jmp .pass2 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + pmulhrsw 
xm0, xm2 + psrlw xm2, 2 ; pw_2048 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mov r2d, 8 + jmp m(inv_txfm_add_dct_dct_8x8).end2 +.full: + REPX {pmulhrsw x, m9}, m12, m13, m14, m15 + pmulhrsw m6, m9, [rsp+32*2] + mova [rsp+32*2], m4 + pmulhrsw m4, m9, [rsp+32*0] + mova [rsp+32*0], m5 + pmulhrsw m5, m9, [rsp+32*1] + mova [rsp+32*1], m7 + pmulhrsw m7, m9, m11 + pmulhrsw m11, m9, m8 + call .main +.pass2: + vpbroadcastd m12, [o(pw_2048)] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m13, m14, m15 + pmulhrsw m12, [rsp] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 + mova [rsp+32*0], m4 + mova [rsp+32*1], m6 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*0], 5, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*1], 7, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 8, 9, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 10, 11, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 12, 13, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 14, 15, 4, 6 + RET +ALIGN function_align +.main_fast: ; bottom half is zero + call m(idct_8x16_internal).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + lea r5, [rax-(o_base)+pw_201_4091x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main2 +ALIGN function_align +.main: + call m(idct_8x16_internal).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova 
[rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + punpcklwd m1, m15, m8 ; in31 in1 + punpckhwd m8, m15 ; in3 in29 + punpcklwd m15, m14, m9 ; in27 in5 + punpckhwd m9, m14 ; in7 in25 + punpcklwd m14, m13, m0 ; in23 in9 + punpckhwd m0, m13 ; in11 in21 + punpcklwd m13, m12, m11 ; in19 in13 + punpckhwd m11, m12 ; in15 in17 + ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a +.main2: + psubsw m6, m1, m11 ; t17 t30 + paddsw m1, m11 ; t16 t31 + psubsw m11, m9, m14 ; t18 t29 + paddsw m9, m14 ; t19 t28 + psubsw m14, m15, m0 ; t21 t26 + paddsw m15, m0 ; t20 t27 + psubsw m0, m8, m13 ; t22 t25 + paddsw m8, m13 ; t23 t24 + ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a + ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a + ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a + ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a + psubsw m13, m1, m9 ; t19a t28a + paddsw m1, m9 ; t16a t31a + psubsw m9, m8, m15 ; t20a t27a + paddsw m8, m15 ; t23a t24a + psubsw m15, m6, m11 ; t18 t29 + paddsw m6, m11 ; t17 t30 + psubsw m11, m0, m14 ; t21 t26 + paddsw m0, m14 ; t22 t25 + ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a + ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 + ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 + ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a + vbroadcasti128 m12, [o(deint_shuf)] + psubsw m14, m1, m8 ; t23 t24 + paddsw m1, m8 ; t16 t31 + psubsw m8, m6, m0 ; t22a t25a + paddsw m6, m0 ; t17a t30a + psubsw m0, m15, m11 ; t21 t26 + paddsw m15, m11 ; t18 t29 + psubsw m11, 
m13, m9 ; t20a t27a + paddsw m13, m9 ; t19a t28a + REPX {pshufb x, m12}, m1, m6, m15, m13 + ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a + vpbroadcastd m9, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 + shufps m9, m14, m8, q1032 ; t23a t22 + vpblendd m14, m8, 0xcc ; t24a t25 + shufps m8, m11, m0, q1032 ; t20 t21a + vpblendd m11, m0, 0xcc ; t27 t26a + punpcklqdq m0, m1, m6 ; t16 t17a + punpckhqdq m1, m6 ; t31 t30a + psubsw m10, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m6, m3, m14 ; out24 out25 + paddsw m3, m14 ; out7 out6 + psubsw m8, m7, m0 ; out16 out17 + paddsw m7, m0 ; out15 out14 + mova m0, [rsp+gprsize+0*32] + punpcklqdq m12, m13, m15 ; t19a t18 + punpckhqdq m13, m15 ; t28a t29 + psubsw m15, m0, m1 ; out31 out30 + paddsw m0, m1 ; out0 out1 + mova m1, [rsp+gprsize+1*32] + mova [rsp+gprsize+0*32], m6 + mova m6, [rsp+gprsize+2*32] + psubsw m14, m1, m13 ; out28 out29 + paddsw m1, m13 ; out3 out2 + psubsw m13, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + psubsw m11, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m9, m6, m12 ; out19 out18 + paddsw m6, m12 ; out12 out13 + ret + +%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] + vbroadcasti128 m%1, [cq+16*%3] + vbroadcasti128 m%2, [cq+16*%4] + shufpd m%1, m%2, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 8 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova m1, [dstq] + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + 
mova [dstq], m1 + add dstq, strideq + dec r2d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*16 + LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 + LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 + LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 + LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 + mova [rsp+32*0], m4 + mova [rsp+32*1], m5 + mova [rsp+32*2], m6 + cmp eobd, 106 + jg .full + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32).main_fast + jmp .pass2 +.full: + LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 + LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 + LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 + LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*8 + LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 + LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 + LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 + LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + call m(inv_txfm_add_dct_dct_8x32).main +.pass2: + vpbroadcastd m12, [o(pw_8192)] + REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 + mova [rsp+32*1], m9 + mova [rsp+32*2], m10 + punpckhwd m9, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m10, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpckhwd m3, m0, m9 + punpcklwd m0, m9 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m10, m4 + punpckhwd m10, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m10, m5 + punpckhdq m10, m5 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 + 
pmulhrsw m12, [rsp+32*0] + mova [rsp+32*0], m8 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m10, 0x31 + vinserti128 m3, xm10, 1 + call m(idct_16x8_internal).main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + mova m0, [rsp+32*0] + mova m1, [rsp+32*1] + mova m2, [rsp+32*2] + punpckhwd m7, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m11 + punpcklwd m1, m11 + punpckhwd m4, m12, m14 + punpcklwd m12, m14 + punpckhwd m5, m13, m15 + punpcklwd m13, m15 + punpckhwd m3, m0, m7 + punpcklwd m0, m7 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m12, m4 + punpckhwd m12, m4 + punpcklwd m4, m5, m13 + punpckhwd m5, m13 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m12, m5 + punpckhdq m12, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m12, 0x31 + vinserti128 m3, xm12, 1 + call m(idct_16x8_internal).main2 + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + add r0, 16 + add r3, 16 + %define dstq r0 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + RET + +cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) +.loop: + mova 
xm0,[cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 5], 1 + pxor m8, m8 + mova [cq+32*0], m8 + mova [cq+32*2], m8 + add cq, 16*16 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 + REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 + REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose8x8 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + sub cq, 16*16-32 + lea dstq, [dstq+r4*4] + add eobd, 0x80000000 + jnc .loop + RET +ALIGN function_align +.transpose8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret + +cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob + add cq, 16*8 + vpbroadcastd m9, [pw_4096] + lea r4, [strideq*3] + lea r5, [dstq+strideq*4] + sub eobd, 107 +.loop: + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] + mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, 
[cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + %define dstq r5 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + add cq, 16*16 + add r0, 16 + add r5, 16 + add eobd, 0x80000000 + jnc .loop + RET + +%define o_base pw_5 + 128 + +%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs +%if %3 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2* 0] + pmulhrsw m1, m15, [%1+%2* 1] + pmulhrsw m2, m15, [%1+%2* 2] + pmulhrsw m3, m15, [%1+%2* 3] + pmulhrsw m4, m15, [%1+%2* 4] + pmulhrsw m5, m15, [%1+%2* 5] + pmulhrsw m6, m15, [%1+%2* 6] + pmulhrsw m7, m15, [%1+%2* 7] + pmulhrsw m8, m15, [%1+%2* 8] + pmulhrsw m9, m15, [%1+%2* 9] + pmulhrsw m10, m15, [%1+%2*10] + pmulhrsw m11, m15, [%1+%2*11] + pmulhrsw m12, m15, [%1+%2*12] + pmulhrsw m13, m15, [%1+%2*13] + pmulhrsw m14, m15, [%1+%2*14] + pmulhrsw m15, [%1+%2*15] +%else + mova m0, [%1+%2* 0] + mova m1, [%1+%2* 1] + mova m2, [%1+%2* 2] + mova m3, [%1+%2* 3] + mova m4, [%1+%2* 4] + mova m5, [%1+%2* 5] + mova m6, [%1+%2* 6] + mova m7, [%1+%2* 7] + mova m8, [%1+%2* 8] + mova m9, [%1+%2* 9] + mova m10, [%1+%2*10] + mova m11, [%1+%2*11] + mova m12, [%1+%2*12] + mova m13, [%1+%2*13] + mova m14, [%1+%2*14] + mova m15, [%1+%2*15] +%endif + mova [rsp], m15 +%if %4 + pxor m15, m15 + REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15 +%endif +%endmacro + +%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 + pmovzxbw m%4, [dstq+%6] + 
pmulhrsw m%3, m%5 + pmulhrsw m%1, m%5 + paddw m%3, m%4 + pmovzxbw m%4, [r2+%7] + paddw m%1, m%4 + packuswb m%3, m%1 + vpermq m%3, m%3, q3120 + mova [dstq+%6], xm%3 + vextracti128 [r2+%7], m%3, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3 + %undef cmp + LOAD_16ROWS cq, 64, 1 + call m(idct_16x16_internal).main + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + lea tmp3q, [tmp1q+32*16] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp3q-32*4+ 0], xm0 + vextracti128 [tmp3q+32*0+ 0], m0, 1 + mova [tmp3q-32*3+ 0], xm2 + vextracti128 [tmp3q+32*1+ 0], m2, 1 + mova [tmp3q-32*2+ 0], xm4 + vextracti128 [tmp3q+32*2+ 0], m4, 1 + mova [tmp3q-32*1+ 0], xm6 + vextracti128 [tmp3q+32*3+ 0], m6, 1 + mova [tmp3q-32*4+16], xm8 + vextracti128 [tmp3q+32*0+16], m8, 1 + mova [tmp3q-32*3+16], xm10 + vextracti128 [tmp3q+32*1+16], m10, 1 + mova [tmp3q-32*2+16], xm12 + vextracti128 [tmp3q+32*2+16], m12, 1 + mova [tmp3q-32*1+16], xm14 + vextracti128 [tmp3q+32*3+16], m14, 1 + cmp eobd, 150 + jg .full + vinserti128 m0, m1, xm9, 1 + vperm2i128 m4, m1, m9, 0x31 + vinserti128 m2, m5, xm13, 1 + vperm2i128 m6, m5, m13, 0x31 + vinserti128 m1, m3, xm11, 1 + vperm2i128 m5, m3, m11, 0x31 + vinserti128 m3, m7, xm15, 1 + vperm2i128 m7, m7, m15, 0x31 + call .main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 16 + jmp m(inv_txfm_add_dct_dct_16x4).dconly +.full: + mova [tmp1q-32*4], m1 + mova [tmp1q-32*3], m3 + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m7 + mova [tmp1q+32*0], m9 + mova [tmp1q+32*1], m11 + mova [tmp1q+32*2], m13 + mova [tmp1q+32*3], m15 + 
LOAD_16ROWS cq+32, 64, 1 + call m(idct_16x16_internal).main + lea r2, [tmp3q+32*8] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [r2-32*4+ 0], xm0 + vextracti128 [r2+32*0+ 0], m0, 1 + mova [r2-32*3+ 0], xm2 + vextracti128 [r2+32*1+ 0], m2, 1 + mova [r2-32*2+ 0], xm4 + vextracti128 [r2+32*2+ 0], m4, 1 + mova [r2-32*1+ 0], xm6 + vextracti128 [r2+32*3+ 0], m6, 1 + mova [r2-32*4+16], xm8 + vextracti128 [r2+32*0+16], m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main_oddhalf_fast: ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(pw_3703x8)] + vpbroadcastd m7, [o(pw_1751x8)] + vpbroadcastd m12, [o(pw_m1380x8)] + vpbroadcastd m8, [o(pw_3857x8)] + 
vpbroadcastd m13, [o(pw_3973x8)] + vpbroadcastd m15, [o(pw_995x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(pw_m2106x8)] + vpbroadcastd m7, [o(pw_3513x8)] + vpbroadcastd m9, [o(pw_3290x8)] + vpbroadcastd m8, [o(pw_2440x8)] + vpbroadcastd m14, [o(pw_m601x8)] + vpbroadcastd m15, [o(pw_4052x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(pd_2048)] + jmp .main2 +ALIGN function_align +.main_oddhalf: + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(pd_2048)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 + ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; 
t22 + mova m6, [rsp+gprsize+32*1] ; in15 + mova [rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m3, [o(pw_2896_2896)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd 
m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 + IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + 
IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m7, [rsp+gprsize+32*0] + mova m1, [rsp+gprsize+32*2] + IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 + ret + +; Perform the final sumsub step and YMM lane shuffling +%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] + mova m%3, [tmp2q+32*( 3-%1)] + psubsw m%4, m%1, m%3 + paddsw m%1, m%3 + mova m%3, [tmp1q+32*(11-%2)] + mova [tmp1q+32*(11-%2)+16], xm%4 + vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 + paddsw m%4, m%2, m%3 + psubsw m%2, m%3 + mova [tmp1q+32*(11-%2)], xm%2 + vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 + vperm2i128 m%2, m%1, m%4, 0x31 + vinserti128 m%1, xm%4, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 16 + jmp m(inv_txfm_add_dct_dct_32x8).dconly +.normal: + PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [cq+32* 1] + pmulhrsw m1, m15, [cq+32* 3] + pmulhrsw m2, m15, [cq+32* 5] + pmulhrsw m3, m15, [cq+32* 7] + pmulhrsw m4, m15, [cq+32* 9] + pmulhrsw m5, m15, [cq+32*11] + pmulhrsw m6, m15, [cq+32*13] + pmulhrsw m7, m15, [cq+32*15] + pmulhrsw m8, m15, [cq+32*17] + pmulhrsw m9, m15, [cq+32*19] + pmulhrsw m10, m15, [cq+32*21] + pmulhrsw m11, m15, [cq+32*23] + pmulhrsw m12, m15, [cq+32*25] + pmulhrsw m13, m15, [cq+32*27] + pmulhrsw m14, m15, [cq+32*29] + pmulhrsw m15, [cq+32*31] + lea tmp1q, [rsp+32*7] + 
lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + LOAD_16ROWS cq+32*0, 32*2, 1, 0 + pxor m15, m15 + mov r3d, 8 +.zero_loop: + mova [cq+32*0], m15 + mova [cq+32*1], m15 + mova [cq+32*2], m15 + mova [cq+32*3], m15 + add cq, 32*4 + dec r3d + jg .zero_loop + call m(idct_16x16_internal).main + call .pass1_end + lea r2, [strideq*3] + mov r3, dstq +.pass2: + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(idct_16x16_internal).main + mova [rsp+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m2, m3, m0 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m4, m5, m6, m7 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m8, m9, m10, m11 + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m11, m12, m13, m14 + pmulhrsw m15, [rsp+32*2] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + test r3, r3 + jnz .right_half + RET +.right_half: + LOAD_8ROWS tmp1q-32*4, 32 + LOAD_8ROWS_H tmp2q-32*4, 32 + lea dstq, [r3+16] + xor r3d, r3d + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + jmp .pass2 +ALIGN function_align +.pass1_end: + mova [rsp+gprsize+32*0], m9 + IDCT32_PASS1_END 0, 8, 1, 9 + IDCT32_PASS1_END 2, 10, 1, 9 + IDCT32_PASS1_END 3, 11, 1, 9 + IDCT32_PASS1_END 4, 12, 1, 9 + IDCT32_PASS1_END 5, 13, 1, 9 + IDCT32_PASS1_END 6, 14, 1, 9 + IDCT32_PASS1_END 7, 15, 1, 9 + mova m1, [rsp+gprsize+32*1] + mova m9, [rsp+gprsize+32*0] + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*1], m7 + IDCT32_PASS1_END 1, 9, 6, 7 + ret + +cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob +%undef cmp + lea rax, [o_base] + vpbroadcastd m9, 
[o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m12, [o(pw_8192)] + cmp eobd, 43 ; if (eob > 43) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg al ; iteration_count++ + add eobd, -279 ; if (eob > 278) + adc r4b, al ; iteration_count++ + lea r3, [strideq*3] + mov rax, cq + paddw m11, m12, m12 ; pw_16384 +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jge .loop + sub cq, 32 + pxor m0, m0 + mov r0d, 8 + cmp cq, rax + ja .zero_loop +.zero_loop_half: + mova [rax+64*0], m0 + mova [rax+64*1], m0 + add rax, 64*4 + mova [rax-64*2], m0 + mova [rax-64*1], m0 + sub r0d, 2 + jg .zero_loop_half + RET +.zero_loop: + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + mova [rax+32*3], m0 + add rax, 32*4 + dec r0d + jg .zero_loop + RET + +cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob +%undef cmp + lea rax, [o_base] + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_2048)] + cmp eobd, 35 ; if (eob > 35) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg r3b ; 
iteration_count += 2 + lea r4d, [r4+r3*2] + lea r3, [strideq*3] + mov r5, dstq + mov rax, cq +.loop: + mova xm0, [cq+32* 0] + mova xm1, [cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 + vinserti128 m7, [cq+32*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jl .ret + test r4b, 1 + jz .loop + add cq, 32*15 + lea dstq, [r5+16] + jmp .loop +.ret: + sub cd, eax + pxor m0, m0 + add cd, 384 +.zero_loop: + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + mova [rax+32*3], m0 + add rax, 32*4 + sub cd, 128 + jge .zero_loop + RET + +cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_32x8).dconly +.normal: + PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + %undef cmp + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + sub eobd, 136 + mov tmp4d, eobd +.pass1_loop: + LOAD_8ROWS cq+64*1, 64*2 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test tmp4d, tmp4d + jl .fast + LOAD_8ROWS_H cq+64*17, 64*2 + call 
m(inv_txfm_add_dct_dct_16x32).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2 + pxor m0, m0 + REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_32x16).pass1_end + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + lea tmp3q, [tmp1q+32*32] + mova m15, [rsp] + mova [tmp3q-32*4], m0 + mova [tmp3q-32*3], m2 + mova [tmp3q-32*2], m4 + mova [tmp3q-32*1], m6 + mova [tmp3q+32*0], m8 + mova [tmp3q+32*1], m10 + mova [tmp3q+32*2], m12 + mova [tmp3q+32*3], m14 + add tmp3q, 32*8 + mova [tmp3q-32*4], m1 + mova [tmp3q-32*3], m3 + mova [tmp3q-32*2], m5 + mova [tmp3q-32*1], m7 + mova [tmp3q+32*0], m9 + mova [tmp3q+32*1], m11 + mova [tmp3q+32*2], m13 + mova [tmp3q+32*3], m15 + vpbroadcastd m9, [o(pw_8192)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + 
mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*16 + add tmp2q, 32*16 + add eobd, 0x80000000 + jnc .pass1_loop + add tmp1q, 32*24 + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + test tmp4d, tmp4d + jge .pass2_loop + add tmp1q, 32*16 + add tmp2q, 32*16 + add tmp3q, 32*16 +.pass2_loop: + LOAD_8ROWS tmp2q-32*4, 32 + test tmp4d, tmp4d + jl .fast2 + LOAD_8ROWS_H tmp3q-32*4, 32 + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + sub tmp3q, 32*8 + LOAD_8ROWS_H tmp3q-32*4, 32 + sub tmp3q, 32*16 + jmp .pass2_loop_end +.fast2: + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + sub tmp3q, 32*24 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 +.pass2_loop_end: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32).pass2_end + lea tmp3q, [tmp1q-32*32] + cmp tmp2q, tmp3q + jb .ret + sub tmp2q, 32*32 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + jmp .pass2_loop +.ret: + RET + +cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob + %undef cmp + vpbroadcastd m9, [pw_8192] + sub eobd, 136 ; if (eob < 136) + shr eobd, 30 ; topleft 16x16 only + lea eobd, [eobq*2-8] + lea r4, [strideq*3] + mov r5, dstq + lea rax, [cq+32] +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 
3, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + add cq, 16 + inc eobd + jz .ret + test eobd, 3 + jnz .loop + add cq, 64*15 + lea dstq, [r5+16] + jmp .loop +.ret: + pxor m0, m0 + mov r0d, 16 + cmp cq, rax + jne .zero_loop +.zero_loop_topleft: + mova [rax-32*1], m0 + mova [rax+32*1], m0 + mova [rax+32*3], m0 + mova [rax+32*5], m0 + add rax, 64*4 + sub r0d, 4 + jg .zero_loop_topleft + RET +.zero_loop: + mova [rax-32*1], m0 + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + add rax, 32*4 + dec r0d + jg .zero_loop + RET + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n + mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [tmp1q-32*(45-%1)] + mova m%4, [tmp2q-32*(20+%1)] +%endif + psubsw m%6, m%5, m%4 ; idct32 out31-n + paddsw m%5, m%4 ; idct32 out 0+n + psubsw m%4, m%6, m%3 ; out32+n + paddsw m%6, m%3 ; out31-n + psubsw m%3, m%5, m%2 ; out63-n + paddsw m%5, m%2 ; out 0+n +%if %0 == 6 ; pass 1 +%if %1 & 1 + mova [tmp2q-32*(19-%1)], m%4 + mova [tmp1q-32*(14+%1)], m%6 + mova [tmp1q+32*(18-%1)], m%3 + mova [tmp2q-32*(51-%1)], m%5 +%else + mova [tmp1q-32*(13-%1)], m%4 + mova [tmp2q-32*(20+%1)], m%6 + mova [tmp2q+32*(12-%1)], m%3 + mova [tmp1q-32*(45-%1)], m%5 +%endif +%else ; pass 2 + REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + pmovzxbw m%2, [%%d0+%9 ] + paddw m%2, m%4 + pmovzxbw m%4, [%%d1+%8 ] + paddw m%4, m%6 + pmovzxbw m%6, [%%d1+%10] + paddw m%3, m%6 + pmovzxbw m%6, [%%d0+%7 ] + paddw m%5, m%6 + packuswb m%2, m%4 + packuswb m%3, m%5 + vpermq m%2, m%2, q3120 + vpermq m%3, m%3, q3120 + mova [%%d0+%9 ], xm%2 + vextracti128 [%%d1+%8 ], m%2, 1 + mova [%%d1+%10], xm%3 + vextracti128 [%%d0+%7 ], m%3, 1 +%endif +%endmacro + +cglobal 
inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_16x4).dconly +.normal: + PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + %undef cmp + lea tmp1q, [rsp+32*23] + lea tmp2q, [tmp1q+32*24] + sub eobd, 151 + mov r7d, eobd +.pass1_loop: + LOAD_16ROWS cq, 64 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m2 + mova [tmp1q-32*2], m4 + mova [tmp1q-32*1], m6 + mova [tmp1q+32*0], m8 + mova [tmp1q+32*1], m10 + mova [tmp1q+32*2], m12 + mova [tmp1q+32*3], m14 + mova [tmp2q-32*4], m1 + mova [tmp2q-32*3], m3 + mova [tmp2q-32*2], m5 + mova [tmp2q-32*1], m7 + mova [tmp2q+32*0], m9 + mova [tmp2q+32*1], m11 + mova [tmp2q+32*2], m13 + mova [tmp2q+32*3], m15 + add cq, 32 + add tmp1q, 32*8 + add tmp2q, 32*8 + add eobd, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*23] + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r7d, r7d + jl .fast + lea r3, [r2+32*8] + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 +.fast: + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova 
[tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r7d, r7d + jl .fast2 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + add r2, 32*24 + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast3 + add r3, 32*24 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 +.fast3: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + vinserti128 m0, [r2+32*2+ 0], 1 + vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 0], 1 + pxor m1, m1 + REPX 
{mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast4 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 +.fast4: + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + RET +ALIGN function_align +%define o_base idct64_mul - 8 +.main_part1: + ; idct64 steps 1-5: + ; in1/31/17/15/ 9/23/25/ 7 -> + ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a + ; in5/27/21/11/13/19/29/ 3 -> + ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a + vpbroadcastd m11, [o(idct64_mul+4* 0)] + vpbroadcastd m13, [o(idct64_mul+4* 1)] + vpbroadcastd m10, [o(idct64_mul+4* 4)] + vpbroadcastd m12, [o(idct64_mul+4* 5)] + pmulhrsw m11, m0 ; t63a + pmulhrsw m0, m13 ; t32a + pmulhrsw m10, m1 ; t62a + pmulhrsw m1, m12 ; t33a + vpbroadcastd m9, [o(idct64_mul+4* 8)] + vpbroadcastd m13, [o(idct64_mul+4* 9)] + vpbroadcastd m8, [o(idct64_mul+4*12)] + vpbroadcastd m12, [o(idct64_mul+4*13)] + pmulhrsw m9, m2 ; t61a + pmulhrsw m2, m13 ; t34a + pmulhrsw m8, m3 ; t60a + pmulhrsw m3, m12 ; t35a + psubsw m12, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m8, m9 ; t61 + paddsw m8, m9 ; t60 + psubsw m9, m11, m10 ; t62 + paddsw m11, m10 ; t63 + ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a + vpbroadcastd m14, [o(pw_401_4076)] + ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a + psubsw m10, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m11, m8 ; t60a + paddsw m11, m8 ; t63a + psubsw m8, m9, m2 ; t34 + paddsw m9, m2 ; t33 + psubsw m2, m12, m1 ; t61 + paddsw m12, m1 ; t62 + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m9 + mova [tmp2q+32*2], m12 + mova [tmp2q+32*3], m11 + vpbroadcastd m13, [o(pw_m4017_799)] + vpbroadcastd m14, [o(pw_799_4017)] + ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a + 
ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp2q+32*0], m10 + mova [tmp2q+32*1], m8 + vpbroadcastd m3, [o(idct64_mul+4*16)] + vpbroadcastd m11, [o(idct64_mul+4*17)] + vpbroadcastd m2, [o(idct64_mul+4*20)] + vpbroadcastd m10, [o(idct64_mul+4*21)] + vpbroadcastd m1, [o(idct64_mul+4*24)] + vpbroadcastd m9, [o(idct64_mul+4*25)] + vpbroadcastd m0, [o(idct64_mul+4*28)] + vpbroadcastd m8, [o(idct64_mul+4*29)] + pmulhrsw m3, m4 ; t59a + pmulhrsw m4, m11 ; t36a + pmulhrsw m2, m5 ; t58a + pmulhrsw m5, m10 ; t37a + pmulhrsw m1, m6 ; t57a + pmulhrsw m6, m9 ; t38a + pmulhrsw m0, m7 ; t56a + pmulhrsw m7, m8 ; t39a + psubsw m8, m4, m5 ; t37 + paddsw m4, m5 ; t36 + psubsw m5, m7, m6 ; t38 + paddsw m7, m6 ; t39 + psubsw m6, m0, m1 ; t57 + paddsw m0, m1 ; t56 + psubsw m1, m3, m2 ; t58 + paddsw m3, m2 ; t59 + ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a + vpbroadcastd m10, [o(pw_3166_2598)] + ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a + psubsw m2, m7, m4 ; t36a + paddsw m7, m4 ; t39a + psubsw m4, m0, m3 ; t59a + paddsw m0, m3 ; t56a + psubsw m3, m6, m1 ; t37 + paddsw m6, m1 ; t38 + psubsw m1, m5, m8 ; t58 + paddsw m5, m8 ; t57 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + mova [tmp2q-32*4], m0 + mova [tmp2q-32*3], m5 + vpbroadcastd m6, [o(pw_m799_m4017)] + vpbroadcastd m7, [o(pw_m4017_799)] + ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 + ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m1 + mova [tmp2q-32*2], m3 + mova [tmp2q-32*1], m2 + ret +%define o_base pw_5 + 128 +.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub + sub rax, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m14, [o(pw_m2896_2896)] +.main_part2_pass1_loop: + call .main_part2_internal + IDCT64_PART2_END 0, 7, 0, 6, 9, 10 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7 + IDCT64_PART2_END 8, 2, 
1, 0, 6, 7 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7 + cmp tmp1q, tmp2q + jne .main_part2_pass1_loop + ret +.main_part2_internal: + mova m0, [tmp1q-32*12] ; t32a + mova m6, [tmp2q-32*13] ; t39a + mova m1, [tmp1q-32* 4] ; t40a + mova m5, [tmp2q+32* 3] ; t55a + add tmp1q, 32 + sub tmp2q, 32 + mova m2, [tmp1q+32* 3] ; t48a + mova m4, [tmp2q-32* 4] ; t47a + mova m3, [tmp1q+32*11] ; t56a + mova m7, [tmp2q+32*12] ; t63a + psubsw m8, m0, m6 ; t39 + paddsw m0, m6 ; t32 + psubsw m6, m4, m1 ; t40 + paddsw m4, m1 ; t47 + psubsw m1, m2, m5 ; t55 + paddsw m2, m5 ; t48 + psubsw m5, m7, m3 ; t56 + paddsw m7, m3 ; t63 + ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a + vpbroadcastd m9, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a + psubsw m3, m0, m4 ; t47a + paddsw m0, m4 ; t32a + psubsw m4, m7, m2 ; t48a + paddsw m7, m2 ; t63a + psubsw m2, m5, m1 ; t40 + paddsw m5, m1 ; t39 + psubsw m1, m8, m6 ; t55 + paddsw m8, m6 ; t56 + ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 + ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a + ret +.main_part2_pass2: + sub rax, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + lea r9, [strideq*5] ; stride*5 + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + lea r8, [r3+strideq*2] ; stride*8 + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [o(pw_m2896_2896)] + call .main_part2_internal + vpbroadcastd m14, [o(pw_2048)] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp tmp1q, tmp2q + jne .main_part2_pass2_loop + ret + +cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, 
[o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 16 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m1, m1 +.dconly_loop: + mova m2, [dstq+32*0] + mova m3, [dstq+32*1] + punpckhbw m4, m2, m1 + punpcklbw m2, m1 + punpckhbw m5, m3, m1 + punpcklbw m3, m1 + paddw m4, m0 + paddw m2, m0 + paddw m5, m0 + paddw m3, m0 + packuswb m2, m4 + packuswb m3, m5 + mova [dstq+32*0], m2 + mova [dstq+32*1], m3 + add dstq, strideq + dec r2d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + LOAD_8ROWS cq+32*0, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+32*2, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+32* 1] + mova m1, [cq+32*31] + mova m2, [cq+32*17] + mova m3, [cq+32*15] + mova m4, [cq+32* 9] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32* 7] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+32* 5] + mova m1, [cq+32*27] + mova m2, [cq+32*21] 
+ mova m3, [cq+32*11] + mova m4, [cq+32*13] + mova m5, [cq+32*19] + mova m6, [cq+32*29] + mova m7, [cq+32* 3] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + sub tmp1q, 32*36 + lea r2, [strideq*3] + mov tmp2d, 4 +.pass2_loop: + lea r3, [tmp1q-32*8] + mova xm0, [r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(idct_16x16_internal).main + mova [rsp+32*0], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 + lea r3, [r3+strideq*4] + WRITE_16X2 8, 9, 2, 3, strideq*0, 
strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + pmulhrsw m15, [rsp+32*0] + lea r3, [r3+strideq*4] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + add tmp1q, 32*16 + add r0, 16 + dec tmp2d + jg .pass2_loop + RET +; 32x64 dct_dct entry point. eob == 0: scale the DC coefficient, clear it and tail-jump to the 32x8 function's .dconly with r2d = 64; otherwise run the two-pass path at .normal. +cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 64 + jmp m(inv_txfm_add_dct_dct_32x8).dconly +.normal: + PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*7] + lea r10d, [eobq-136] + sar r10d, 31 ; r10d = -1 when eob < 136, else 0; r10b is the fast-path flag tested below +.pass1_loop: + lea tmp2q, [tmp1q+32*16] + LOAD_8ROWS cq+64*1, 64*2, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test r10b, r10b + jnz .fast ; eob < 136: cq+64*16 .. cq+64*31 are neither loaded nor cleared + LOAD_8ROWS_H cq+64*17, 64*2, 2 + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2, 1 + mova [rsp], m15 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2, 1 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_32x16).pass1_end + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + lea r3, [tmp1q+32*48] + mova m15, [rsp] + mova [r3-32*4], m0 + mova [r3-32*3], m2 + mova [r3-32*2], m4 + mova [r3-32*1], m6 + mova [r3+32*0], m8 + mova [r3+32*1], m10 + mova [r3+32*2], m12 + mova [r3+32*3], m14 + add r3, 32*24 + mova [r3-32*4], m1 + mova [r3-32*3], m3 + mova [r3-32*2], m5 + mova [r3-32*1], m7 + mova [r3+32*0], m9 + mova [r3+32*1], m11 + mova [r3+32*2], m13 + mova [r3+32*3], m15 + vpbroadcastd m9, [o(pw_16384)] + pmulhrsw m0, m9, 
[tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*8 + add r10d, 0x80000000 + jnc .pass1_loop ; bit 31 of r10d acts as the loop counter: carry (exit) on the first add if eob < 136, on the second otherwise + lea r2, [rsp+32*55] + lea r7, [r2+32*24] +.pass2_loop: + lea r3, [r2+32*8] + lea r8, [r7+32*8] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r10b, r10b + jnz .fast2 + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast2: + mova [rsp], m8 + lea tmp1q, [rsp+32*39] + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] 
+ mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10b, r10b + jnz .fast3 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast3: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r7-32*4] + mova m3, [r7+32*3] + mova m4, [r7+32*0] + mova m7, [r7-32*1] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast4 + mova m1, [r8+32*3] + mova m2, [r8-32*4] + mova m5, [r8-32*1] + mova m6, [r8+32*0] +.fast4: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r7-32*2] + mova m3, [r7+32*1] + mova m4, [r7+32*2] + mova m7, [r7-32*3] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast5 + mova m1, [r8+32*1] + mova m2, [r8-32*2] + mova m5, [r8-32*3] + mova m6, [r8+32*2] +.fast5: + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + add r10d, 0x80000000 + jc .ret ; carry is set by the second add in either case, so pass 2 runs exactly twice + lea r2, [rsp+32*7] + lea r7, [r2+32*16] + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + jmp .pass2_loop +.ret: + RET +; 64x32 dct_dct entry point. eob == 0: DC-only shortcut through the 64x16 function's .dconly with r2d = 32. +cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_64x16).dconly +.normal: + PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + lea tmp1q, [rsp+32*7] + lea tmp4d, [eobq-136] ; negative (bit 31 set) when eob < 136 +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova 
[tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + vpbroadcastd m7, [o(pw_2896x8)] ; 2896*8/32768 ~= 0.7071; applied via pmulhrsw to each row loaded below + pmulhrsw m0, m7, [cq+64* 1] + pmulhrsw m1, m7, [cq+64*31] + pmulhrsw m2, m7, [cq+64*17] + pmulhrsw m3, m7, [cq+64*15] + pmulhrsw m4, m7, [cq+64* 9] + pmulhrsw m5, m7, [cq+64*23] + pmulhrsw m6, m7, [cq+64*25] + pmulhrsw m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] ; rax was advanced by o_idct64_offset above; presumably o() is rax-relative, hence the compensation (TODO confirm against o()'s definition) + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + pmulhrsw m0, m7, [cq+64* 5] + pmulhrsw m1, m7, [cq+64*27] + pmulhrsw m2, m7, [cq+64*21] + pmulhrsw m3, m7, [cq+64*11] + pmulhrsw m4, m7, [cq+64*13] + pmulhrsw m5, m7, [cq+64*19] + pmulhrsw m6, m7, [cq+64*29] + pmulhrsw m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + add cq, 32 + add tmp4d, 0x80000000 + jnc .pass1_loop ; tmp4d bit 31 is the counter: one iteration when eob < 136, two otherwise + lea tmp1q, [rsp+32*15] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + mov tmp4b, 4 ; low byte = pass-2 iteration count; bit 30 of tmp4d remains set only when eob < 136 +.pass2_loop: + lea tmp2q, [tmp1q+32*64] + LOAD_8ROWS tmp1q-32*4, 32 + test tmp4d, 0x40000000 + jnz .fast + LOAD_8ROWS_H tmp2q-32*4, 32 + call 
m(inv_txfm_add_dct_dct_16x32).main_oddhalf + lea tmp3q, [tmp2q-32*8] + LOAD_8ROWS_H tmp3q-32*4, 32 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + lea tmp3q, [tmp1q-32*8] + LOAD_8ROWS tmp3q-32*4, 32 + call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32).pass2_end + add tmp1q, 32*16 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + dec tmp4b + jg .pass2_loop + RET +ALIGN function_align +.transpose_round_interleave: ; helper (also called by the 64x64 function): 4 iterations, each multiplying two groups of eight rows by m10 (set by the caller) and passing them through transpose8x8 + mov tmp3d, 4 +.loop: + lea tmp2q, [tmp1q+32*8] + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, [tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 + mova [tmp1q-32*4], m0 + mova [tmp2q-32*4], m1 + mova [tmp1q-32*3], m2 + mova [tmp2q-32*3], m3 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 + mova [tmp1q-32*2], m4 + mova [tmp2q-32*2], m5 + mova [tmp1q-32*1], m6 + mova [tmp2q-32*1], m7 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 + pmulhrsw m0, m8, m10 + pmulhrsw m1, m9, m10 + REPX {pmulhrsw x, m10}, 
m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add tmp1q, 32*16 + dec tmp3d + jg .loop + ret +; 64x64 dct_dct entry point. eob == 0: DC-only shortcut through the 64x16 function's .dconly with r2d = 64. +cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 64 + jmp m(inv_txfm_add_dct_dct_64x16).dconly +.normal: + PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*71] + lea r10d, [eobq-136] ; negative (bit 31 set) when eob < 136 +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + mova m4, [cq+64* 9] + mova m5, [cq+64*23] + mova m6, [cq+64*25] + mova m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + 
mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + mova m4, [cq+64*13] + mova m5, [cq+64*19] + mova m6, [cq+64*29] + mova m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + add cq, 32 + add r10d, 0x80000000 + jnc .pass1_loop ; bit 31 of r10d is the counter: one iteration when eob < 136, two otherwise + lea tmp1q, [rsp+32*7] + mov r10b, 4 ; low byte = pass-2 iteration count; bit 30 stays set only when eob < 136 +.pass2_loop: + lea r2, [tmp1q+32*64] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + mova [rsp], m4 + test r10d, 0x40000000 + jnz .fast + lea r3, [r2+32*64] + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast: + call m(idct_16x16_internal).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10d, 0x40000000 + jnz .fast2 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add r2, 32*8 + add r3, 32*8 + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r2-32*4] ; 1 + mova m3, [r2+32*3] ; 15 + mova m4, [r2+32*0] ; 9 + mova m7, [r2-32*1] ; 7 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + 
test r10d, 0x40000000 + jnz .fast3 + mova m1, [r3+32*3] ; 31 + mova m2, [r3-32*4] ; 17 + mova m5, [r3-32*1] ; 23 + mova m6, [r3+32*0] ; 25 +.fast3: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r2-32*2] ; 5 + mova m3, [r2+32*1] ; 11 + mova m4, [r2+32*2] ; 13 + mova m7, [r2-32*3] ; 3 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast4 + mova m1, [r3+32*1] ; 27 + mova m2, [r3-32*2] ; 21 + mova m5, [r3-32*3] ; 19 + mova m6, [r3+32*2] ; 29 +.fast4: + call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + sub tmp1q, 32*28 + sub dstq, r8 ; r8 is not written in the visible part of this function; presumably set by main_part2_pass2 (TODO confirm) + lea dstq, [dstq+strideq*4+16] + dec r10b + jg .pass2_loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/itx_init_tmpl.c b/third_party/dav1d/src/x86/itx_init_tmpl.c new file mode 100644 index 0000000000..7d0c58c8e8 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_init_tmpl.c @@ -0,0 +1,187 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/itx.h" +/* decl_itxN_fns(w, h, opt): declare the N transform-type variants (2, 12, 16 or 17) of dav1d_inv_txfm_add_<type>_<w>x<h>_<opt>; each larger macro extends the previous one. */ +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \ +decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt) + +decl_itx17_fns( 4, 4, avx2); 
+decl_itx16_fns( 4, 8, avx2); +decl_itx16_fns( 4, 16, avx2); +decl_itx16_fns( 8, 4, avx2); +decl_itx16_fns( 8, 8, avx2); +decl_itx16_fns( 8, 16, avx2); +decl_itx2_fns ( 8, 32, avx2); +decl_itx16_fns(16, 4, avx2); +decl_itx16_fns(16, 8, avx2); +decl_itx12_fns(16, 16, avx2); +decl_itx2_fns (16, 32, avx2); +decl_itx2_fns (32, 8, avx2); +decl_itx2_fns (32, 16, avx2); +decl_itx2_fns (32, 32, avx2); + +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_avx2); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_avx2); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2); + +decl_itx17_fns( 4, 4, ssse3); +decl_itx16_fns( 4, 8, ssse3); +decl_itx16_fns( 8, 4, ssse3); +decl_itx16_fns( 8, 8, ssse3); +decl_itx16_fns( 4, 16, ssse3); +decl_itx16_fns(16, 4, ssse3); +decl_itx16_fns( 8, 16, ssse3); +decl_itx16_fns(16, 8, ssse3); +decl_itx12_fns(16, 16, ssse3); +decl_itx2_fns ( 8, 32, ssse3); +decl_itx2_fns (32, 8, ssse3); +decl_itx2_fns (16, 32, ssse3); +decl_itx2_fns (32, 16, ssse3); +decl_itx2_fns (32, 32, ssse3); + +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3); +decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3); +/* Fill c->itxfm_add[tx size][tx type] with x86 assembly functions according to dav1d_get_cpu_flags(). Returns early without SSSE3. SSSE3 functions are assigned only when BITDEPTH == 8; AVX2 functions additionally require ARCH_X86_64. The pfx macro argument is either empty or R and selects the TX_<w>X<h> or RTX_<w>X<h> index. */ +COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + dav1d_inv_txfm_add_##type##_##w##x##h##_##ext + +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) +/* The assembly name and the enum list the two 1-D types in opposite order, e.g. dct_adst is stored under ADST_DCT. */ +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, 
w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + +#if BITDEPTH == 8 + assign_itx17_fn(, 4, 4, ssse3); + assign_itx16_fn(R, 4, 8, ssse3); + assign_itx16_fn(R, 8, 4, ssse3); + assign_itx16_fn(, 8, 8, ssse3); + assign_itx16_fn(R, 4, 16, ssse3); + assign_itx16_fn(R, 16, 4, ssse3); + assign_itx16_fn(R, 8, 16, ssse3); + assign_itx16_fn(R, 16, 8, ssse3); + assign_itx12_fn(, 16, 16, ssse3); + assign_itx2_fn (R, 8, 32, ssse3); + assign_itx2_fn (R, 32, 8, ssse3); + assign_itx2_fn (R, 16, 32, ssse3); + assign_itx2_fn (R, 32, 16, ssse3); + assign_itx2_fn (, 32, 32, ssse3); + assign_itx1_fn (R, 16, 64, ssse3); + assign_itx1_fn (R, 32, 64, ssse3); + assign_itx1_fn (R, 64, 16, ssse3); + assign_itx1_fn (R, 64, 32, ssse3); + assign_itx1_fn ( , 64, 64, ssse3); +#endif + /* AVX2 functions assigned below overwrite the SSSE3 entries set above. */ + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + +#if BITDEPTH == 8 && ARCH_X86_64 + assign_itx17_fn( , 4, 4, avx2); + assign_itx16_fn(R, 4, 8, avx2); + assign_itx16_fn(R, 4, 16, avx2); + 
assign_itx16_fn(R, 8, 4, avx2); + assign_itx16_fn( , 8, 8, avx2); + assign_itx16_fn(R, 8, 16, avx2); + assign_itx2_fn (R, 8, 32, avx2); + assign_itx16_fn(R, 16, 4, avx2); + assign_itx16_fn(R, 16, 8, avx2); + assign_itx12_fn( , 16, 16, avx2); + assign_itx2_fn (R, 16, 32, avx2); + assign_itx1_fn (R, 16, 64, avx2); + assign_itx2_fn (R, 32, 8, avx2); + assign_itx2_fn (R, 32, 16, avx2); + assign_itx2_fn ( , 32, 32, avx2); + assign_itx1_fn (R, 32, 64, avx2); + assign_itx1_fn (R, 64, 16, avx2); + assign_itx1_fn (R, 64, 32, avx2); + assign_itx1_fn ( , 64, 64, avx2); +#endif +} diff --git a/third_party/dav1d/src/x86/itx_ssse3.asm b/third_party/dav1d/src/x86/itx_ssse3.asm new file mode 100644 index 0000000000..3ebd3cc17c --- /dev/null +++ b/third_party/dav1d/src/x86/itx_ssse3.asm @@ -0,0 +1,6559 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + + +SECTION_RODATA 16 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 +; COEF_PAIR a, b[, flag]: always defines pw_a_mb = (a, -b); also pw_b_a = (b, a) unless flag is 2, and pw_ma_mb = (-a, -b) when flag is non-zero. +%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 +pw_%1_m%2: times 4 dw %1, -%2 +%if %3 != 2 +pw_%2_%1: times 4 dw %2, %1 +%endif +%if %3 +pw_m%1_m%2: times 4 dw -%1, -%2 +%endif +%endmacro + +;adst4 +pw_1321_3803: times 4 dw 1321, 3803 +pw_2482_m1321: times 4 dw 2482, -1321 +pw_3344_2482: times 4 dw 3344, 2482 +pw_3344_m3803: times 4 dw 3344, -3803 +pw_3344_m3344: times 4 dw 3344, -3344 +pw_0_3344 times 4 dw 0, 3344 +pw_m6688_m3803: times 4 dw -6688, -3803 + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 3784, 1567, 1 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4017, 799, 1 +COEF_PAIR 201, 4091 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 4052, 601 +COEF_PAIR 2276, 3406, 1 +COEF_PAIR 4076, 401, 2 +COEF_PAIR 2598, 3166, 2 +COEF_PAIR 3612, 1931, 2 +COEF_PAIR 1189, 3920, 2 +; pd_2048 is the rounding bias added before the psrad-by-12 in the ITX_MUL* macros; pw_<n>x8 and pw_m<n>x8 entries store n*8 and -n*8 (used with pmulhrsw). +pd_2048: times 4 dd 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pw_4096: times 8 dw 4096 +pw_16384: times 8 dw 16384 +pw_m16384: times 8 dw -16384 +pw_1697x16: times 
8 dw 1697*16 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_3344x8: times 8 dw 3344*8 +pw_8192: times 8 dw 8192 +pw_m8192: times 8 dw -8192 +pw_5: times 8 dw 5 +pw_201x8: times 8 dw 201*8 +pw_4091x8: times 8 dw 4091*8 +pw_m2751x8: times 8 dw -2751*8 +pw_3035x8: times 8 dw 3035*8 +pw_1751x8: times 8 dw 1751*8 +pw_3703x8: times 8 dw 3703*8 +pw_m1380x8: times 8 dw -1380*8 +pw_3857x8: times 8 dw 3857*8 +pw_995x8: times 8 dw 995*8 +pw_3973x8: times 8 dw 3973*8 +pw_m2106x8: times 8 dw -2106*8 +pw_3513x8: times 8 dw 3513*8 +pw_2440x8: times 8 dw 2440*8 +pw_3290x8: times 8 dw 3290*8 +pw_m601x8: times 8 dw -601*8 +pw_4052x8: times 8 dw 4052*8 + +pw_4095x8: times 8 dw 4095*8 +pw_101x8: times 8 dw 101*8 +pw_2967x8: times 8 dw 2967*8 +pw_m2824x8: times 8 dw -2824*8 +pw_3745x8: times 8 dw 3745*8 +pw_1660x8: times 8 dw 1660*8 +pw_3822x8: times 8 dw 3822*8 +pw_m1474x8: times 8 dw -1474*8 +pw_3996x8: times 8 dw 3996*8 +pw_897x8: times 8 dw 897*8 +pw_3461x8: times 8 dw 3461*8 +pw_m2191x8: times 8 dw -2191*8 +pw_3349x8: times 8 dw 3349*8 +pw_2359x8: times 8 dw 2359*8 +pw_4036x8: times 8 dw 4036*8 +pw_m700x8: times 8 dw -700*8 +pw_4065x8: times 8 dw 4065*8 +pw_501x8: times 8 dw 501*8 +pw_3229x8: times 8 dw 3229*8 +pw_m2520x8: times 8 dw -2520*8 +pw_3564x8: times 8 dw 3564*8 +pw_2019x8: times 8 dw 2019*8 +pw_3948x8: times 8 dw 3948*8 +pw_m1092x8: times 8 dw -1092*8 +pw_3889x8: times 8 dw 3889*8 +pw_1285x8: times 8 dw 1285*8 +pw_3659x8: times 8 dw 3659*8 +pw_m1842x8: times 8 dw -1842*8 +pw_3102x8: times 8 dw 3102*8 +pw_2675x8: times 8 dw 2675*8 +pw_4085x8: times 8 dw 4085*8 +pw_m301x8: times 8 dw -301*8 + +SECTION .text +; REPX {template}, a, b, ...: expands the template once per remaining argument, substituting that argument for x. +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r5-$$+x ; PIC +%endif +; WRITE_4X4: adds the words of two source registers to four 4-pixel dst rows (row selectors in the last four arguments, 0-3 relative to dstq), packing with unsigned saturation. Overwrites r2 and the three tmp registers. +%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rotate 5 +%rep 4 + %if %1 & 
2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + + movd m%3, [%%row_adr1] ;dst0 + movd m%5, [%%row_adr2] ;dst1 + punpckldq m%3, m%5 ;high: dst1 :low: dst0 + movd m%4, [%%row_adr3] ;dst2 + movd m%5, [%%row_adr4] ;dst3 + punpckldq m%4, m%5 ;high: dst3 :low: dst2 + + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word + + paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 + paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 + + packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 + + movd [%%row_adr1], m%3 ;store dst0 + out0 + pshuflw m%4, m%3, q1032 + movd [%%row_adr2], m%4 ;store dst1 + out1 + punpckhqdq m%3, m%3 + movd [%%row_adr3], m%3 ;store dst2 + out2 + psrlq m%3, 32 + movd [%%row_adr4], m%3 ;store dst3 + out3 +%endmacro +; ITX4_END: multiplies m0/m1 by pw_<rnd> via pmulhrsw (skipped when rnd is 0), adds them to the four dst rows in the given row order through WRITE_4X4, then returns. +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + mova m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + + WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 + ret +%endmacro + +; flags: 1 = swap, 2: coef_regs, 4: no_pack +%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags +%if %6 & 2 + pmaddwd m%2, m%4, m%1 + pmaddwd m%1, m%5 +%elif %6 & 1 + pmaddwd m%2, m%1, [o(pw_%5_%4)] + pmaddwd m%1, [o(pw_%4_m%5)] +%else + pmaddwd m%2, m%1, [o(pw_%4_m%5)] + pmaddwd m%1, [o(pw_%5_%4)] +%endif + paddd m%2, m%3 + paddd m%1, m%3 + psrad m%2, 12 + psrad m%1, 12 +%if %6 & 4 == 0 + packssdw m%1, m%2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 + mova m3, [o(pd_2048)] + punpckhwd m2, m0, m1 ;unpacked in1 in3 + punpcklwd m0, m1 ;unpacked in0 in2 + ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 + ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 + psubsw m1, m0, m2 ;high: out2 ;low: out3 + paddsw m0, m2 ;high: out1 ;low: out0 +%endmacro +; INV_TXFM_FN: emits the public inv_txfm_add_<type1>_<type2>_<size> entry point. It loads the address of i<type2>_<size>_internal.pass2 into tx2q and enters i<type1>_<size>_internal; for dct_dct, eob == 0 instead continues with the code that follows the macro invocation. +%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack +cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 + 
%define %%p1 m(i%1_%3_internal) +%if ARCH_X86_32 + LEA r5, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%3_internal).pass2)] + call %%p1 + RET +%%end: +%else + lea tx2q, [o(m(i%2_%3_internal).pass2)] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro +; INV_TXFM_4X4_FN: 4x4 entry point for one type pair. For dct_dct the extra code is the eob == 0 path: broadcast the DC coefficient, multiply it twice by pw_2896x8, clear it in memory and tail-call iadst_4x4_internal.end2 to add and store. +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4, 6 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd ;0 + pmulhrsw m0, m1 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal).end2 +%endif +%endmacro + +INIT_XMM ssse3 + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity +; Pass 1: packed 4-point IDCT, reshuffle of the result, then jump through tx2q to the second transform's .pass2. The .pass2 label here runs the IDCT again, zeroes the coefficients and adds the result to dst. +cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] ;high: in1 ;low: in0 + mova m1, [coeffq+16*1] ;high: in3 ;low in2 + + IDCT4_1D_PACKED + + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 ;high: in1 ;low: in0 + pshufb m1, m3, m2 ;high: in3 ;low :in2 + jmp tx2q + +.pass2: + IDCT4_1D_PACKED + + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); + + ITX4_END 0, 1, 3, 2 ;IDCT4_1D_PACKED leaves out3 in the low half of m1 and out2 in the high half, hence row order 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call .main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call .main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 0, 1, 2, 3 + +ALIGN function_align +.main: + punpcklwd m2, m0, m1 ;unpacked in0 in2 + punpckhwd m0, m1 ;unpacked in1 
in3 + mova m3, m0 + pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 + pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 + paddd m1, m0 ;t2 + pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 + paddd m4, m0 ;t0 + t3 + pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m0, [o(pd_2048)] + paddd m1, m0 ;t2 + 2048 + paddd m2, m0 + paddd m0, m4 ;t0 + t3 + 2048 + paddd m5, m2 ;t1 + t3 + 2048 + paddd m2, m4 + paddd m2, m3 ;t0 + t1 - t3 + 2048 + REPX {psrad x, 12}, m1, m0, m5, m2 + packssdw m0, m5 ;high: out1 ;low: out0 + packssdw m1, m2 ;high: out3 ;low: out2 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call m(iadst_4x4_internal).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 ;high: in3 ;low :in2 + punpckhwd m1, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call m(iadst_4x4_internal).main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 3, 2, 1, 0 ;same adst core as iadst_4x4_internal, but rows are written in reverse order (iadst uses 0, 1, 2, 3) + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m0, m3 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 ;each word becomes x plus x*1697/4096, i.e. roughly x*1.414 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal).end + +%macro 
IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ;low: in1 high: in3 + punpcklqdq m0, m1 ;low: in0 high: in2 + psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 + paddw m0, m3 ;low: in0 + in1 high: in2 + in3 + punpckhqdq m2, m2 ;t2 t2 + punpcklqdq m0, m0 ;t0 t0 + psubw m1, m0, m2 + psraw m1, 1 ;t4 t4 + psubw m1, m3 ;low: t1/out2 high: t3/out1 + psubw m0, m1 ;high: out0 + paddw m2, m1 ;low: out3 +%endmacro +; 4x4 WHT: loads and clears the coefficients, shifts them right by 2, runs IWHT4_1D_PACKED twice with an unpack-based reordering in between, and finishes through ITX4_END with rnd = 0 (no final scaling). +cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + + IWHT4_1D_PACKED + + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + + IWHT4_1D_PACKED + + shufpd m0, m2, 0x01 + ITX4_END 0, 3, 2, 1, 0 + + +%macro IDCT8_1D_PACKED 0 + mova m6, [o(pd_2048)] + punpckhwd m4, m0, m3 ;unpacked in1 in7 + punpcklwd m0, m2 ;unpacked in0 in4 + punpckhwd m2, m1 ;unpacked in5 in3 + punpcklwd m1, m3 ;unpacked in2 in6 + ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a + ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a + ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 + psubsw m3, m4, m2 ;low: t6a high: t5a + paddsw m4, m2 ;low: t7 high: t4 + pshufb m3, [o(deint_shuf1)] + ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 + ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 + psubsw m2, m0, m1 ;low: tmp3 high: tmp2 + paddsw m0, m1 ;low: tmp0 high: tmp1 + punpcklqdq m1, m4, m3 ;low: t7 high: t6 + punpckhqdq m4, m3 ;low: t4 high: t5 + psubsw m3, m0, m1 ;low: out7 high: out6 + paddsw m0, m1 ;low: out0 high: out1 + paddsw m1, m2, m4 ;low: out3 high: out2 + psubsw m2, m4 ;low: out4 high: out5 +%endmacro + +;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 + punpckhwd m%4, m%1, m%2 + punpcklwd m%1, m%2 +%if %7 < 8 + pmaddwd m%2, m%7, m%1 + pmaddwd m%3, m%7, m%4 
+%else + mova m%2, [o(pw_%7_%6)] +%if %8 + pmaddwd m%3, m%1, m%2 + pmaddwd m%2, m%4 +%else + pmaddwd m%3, m%4, m%2 + pmaddwd m%2, m%1 +%endif +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %8 + packssdw m%3, m%2 +%else + packssdw m%2, m%3 ;dst2 +%endif +%if %7 < 8 + pmaddwd m%4, m%6 + pmaddwd m%1, m%6 +%elif %8 + mova m%2, [o(pw_%6_m%7)] + pmaddwd m%4, m%2 + pmaddwd m%1, m%2 +%else + mova m%3, [o(pw_%6_m%7)] + pmaddwd m%4, m%3 + pmaddwd m%1, m%3 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 ;dst1 +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 + psubsw m%3, m%1, m%2 ;out2 + paddsw m%2, m%1 ;out1 + paddsw m%1, m%5, m%4 ;out0 + psubsw m%4, m%5 ;out3 +%endmacro + +%macro WRITE_4X8 4 ;row[1-4] + WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 + lea dstq, [dstq+strideq*4] + WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 +%endmacro + +%macro INV_4X8 0 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ;low: in2 high: in3 + punpckldq m0, m2 ;low: in0 high: in1 + punpckldq m2, m3, m4 ;low: in4 high: in5 + punpckhdq m3, m4 ;low: in6 high: in7 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_4x8_internal).end3 +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + 
pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(idct_8x4_internal).main + jmp m(iadst_4x8_internal).pass1_end + +.pass2: + call .main + shufps m1, m1, q1032 + shufps m3, m3, q1032 + mova m4, [o(pw_2048)] + jmp m(iadst_4x8_internal).end2 + +ALIGN function_align +.main: + IDCT8_1D_PACKED + ret + + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal).main + +.pass1_end: + INV_4X8 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call .main + mova m4, [o(pw_2048)] + pxor m5, m5 + psubw m5, m4 + +.end: + punpcklqdq m4, m5 + +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pxor m5, m5 + mova [coeffq+16*0], m5 + mova [coeffq+16*1], m5 + mova [coeffq+16*2], m5 + mova [coeffq+16*3], m5 + +.end3: + WRITE_4X8 0, 1, 2, 3 + RET + +ALIGN function_align +.main: + mova m6, [o(pd_2048)] + punpckhwd m4, m3, m0 ;unpacked in7 in0 + punpckhwd m5, m2, m1 ;unpacked in5 in2 + punpcklwd m1, m2 ;unpacked in3 in4 + punpcklwd m0, m3 ;unpacked in1 in6 + ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a + ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a + ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a + ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a + + psubsw m3, m4, m1 ;low: t4 high: t5 + paddsw m4, m1 ;low: t0 high: t1 + psubsw m2, m5, m0 ;low: t6 high: t7 + paddsw m5, m0 ;low: t2 high: t3 + + shufps m1, m3, m2, q1032 + punpckhwd m2, m1 + punpcklwd m3, m1 + ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a + ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a + + psubsw m1, m4, m5 ;low: t2 high: t3 + paddsw m4, m5 ;low: out0 high: -out7 + psubsw m5, m3, m2 ;low: t7 high: t6 + 
paddsw m3, m2 ;low: out6 high: -out1 + shufps m0, m4, m3, q3210 ;low: out0 high: -out1 + shufps m3, m4, q3210 ;low: out6 high: -out7 + + mova m2, [o(pw_2896_m2896)] + mova m7, [o(pw_2896_2896)] + shufps m4, m1, m5, q1032 ;low: t3 high: t7 + shufps m1, m5, q3210 ;low: t2 high: t6 + punpcklwd m5, m1, m4 + punpckhwd m1, m4 + pmaddwd m4, m2, m1 ;-out5 + pmaddwd m2, m5 ; out4 + pmaddwd m1, m7 ; out2 + pmaddwd m5, m7 ;-out3 + REPX {paddd x, m6}, m4, m2, m1, m5 + REPX {psrad x, 12}, m4, m2, m1, m5 + packssdw m1, m5 ;low: out2 high: -out3 + packssdw m2, m4 ;low: out4 high: -out5 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal).main + + punpcklwd m4, m3, m2 + punpckhwd m3, m2 + punpcklwd m5, m1, m0 + punpckhwd m1, m0 + punpckldq m2, m3, m1 ;low: in4 high: in5 + punpckhdq m3, m1 ;low: in6 high: in7 + punpckldq m0, m4, m5 ;low: in0 high: in1 + punpckhdq m1, m4, m5 ;low: in2 high: in3 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal).main + + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m5, [o(pw_2048)] + pxor m4, m4 + psubw m4, m5 + jmp m(iadst_4x8_internal).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, 
m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_4x8_internal).pass1_end + +.pass2: + mova m4, [o(pw_4096)] + jmp m(iadst_4x8_internal).end2 + + +%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] + movq m%3, [dstq ] + movq m%4, [dstq+strideq] + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + movq [dstq ], m%3 + punpckhqdq m%3, m%3 + movq [dstq+strideq], m%3 +%endmacro + +%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] + WRITE_8X2 %1, %2, %5, %6, %7 + lea dstq, [dstq+strideq*2] + WRITE_8X2 %3, %4, %5, %6, %7 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + mova m2, [o(pw_2048)] + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_8x4_internal).end2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + call m(idct_4x8_internal).main + + mova m4, [o(deint_shuf1)] + mova m5, [o(deint_shuf2)] + pshufb m0, m4 + pshufb m1, m5 + pshufb m2, m4 + pshufb m3, m5 + punpckhdq m4, m0, m1 + punpckldq m0, m1 + punpckhdq m5, m2, m3 + punpckldq m2, m3 + punpckhqdq m1, m0, m2 ;in1 + punpcklqdq m0, m2 ;in0 + punpckhqdq m3, m4, m5 ;in3 + punpcklqdq m2 ,m4, m5 ;in2 + jmp tx2q + +.pass2: + call .main + jmp m(iadst_8x4_internal).end + +ALIGN function_align +.main: + mova m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + 
+INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal).main + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + pxor m5, m5 + psubsw m3, m5, m1 + psubsw m5, m4 + punpckhdq m4, m5, m3 + punpckldq m5, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m5 ;in1 + punpcklwd m0, m5 ;in0 + punpcklwd m2, m3, m4 ;in2 + punpckhwd m3, m4 ;in3 + jmp tx2q + +.pass2: + call .main + +.end: + mova m4, [o(pw_2048)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + +.end2: + pxor m6, m6 + mova [coeffq+16*0], m6 + mova [coeffq+16*1], m6 + mova [coeffq+16*2], m6 + mova [coeffq+16*3], m6 +.end3: + WRITE_8X4 0, 1, 2, 3, 4, 5, 6 + RET + +ALIGN function_align +.main: + punpckhwd m6, m0, m2 ;unpacked in0 in2 + punpcklwd m0, m2 ;unpacked in0 in2 + punpckhwd m7, m1, m3 ;unpacked in1 in3 + punpcklwd m1, m3 ;unpacked in1 in3 + + mova m2, [o(pw_3344_m3344)] + mova m4, [o(pw_0_3344)] + pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 + pmaddwd m5, m4, m7 ;3344 * in3 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m3, m5 + paddd m2, m4 + mova m4, [o(pd_2048)] + paddd m3, m4 ;t2 + 2048 + paddd m2, m4 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + + pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m3, m4 ;t0 + t3 + + pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m4, [o(pd_2048)] + paddd m0, m4 + paddd m4, m3 ;t0 + t3 + 2048 + paddd m5, m0 ;t1 + t3 + 
2048 + paddd m3, m0 + paddd m3, m1 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m3, 12 ;out3 + packssdw m0, m4, m5 ;low: out0 high: out1 + + pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m1, m4 ;t0 + t3 + pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + + mova m4, [o(pd_2048)] + paddd m6, m4 + paddd m4, m1 ;t0 + t3 + 2048 + paddd m5, m6 ;t1 + t3 + 2048 + paddd m1, m6 + paddd m1, m7 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m1, 12 ;out3 + packssdw m3, m1 ;out3 + packssdw m4, m5 ;low: out0 high: out1 + + punpckhqdq m1, m0, m4 ;out1 + punpcklqdq m0, m4 ;out0 + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal).main + + punpckhwd m5, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + + pxor m0, m0 + psubsw m4, m0, m2 + psubsw m0, m5 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + punpckhwd m1, m0, m3 ;in1 + punpcklwd m0, m3 ;in0 + punpckhwd m3, m2, m4 ;in3 + punpcklwd m2, m4 ;in2 + jmp tx2q + +.pass2: + call m(iadst_8x4_internal).main + mova m4, m0 + mova m5, m1 + mova m0, m3 + mova m1, m2 + mova m2, m5 + mova m3, m4 + jmp m(iadst_8x4_internal).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + 
mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + paddsw m0, m0 + paddsw m1, m1 + paddsw m2, m2 + paddsw m3, m3 + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m5, m4, m1 + punpckldq m4, m1 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m4 ;in1 + punpcklwd m0, m4 ;in0 + punpcklwd m2, m3, m5 ;in2 + punpckhwd m3, m5 ;in3 + jmp tx2q + +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_8x4_internal).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8, 8, 16*4 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 3 + pmulhrsw m0, m1 + pmulhrsw m0, m2 +.end: + mov r3d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] +.loop: + WRITE_8X4 0, 0, 0, 0, 1, 2, 3 + lea dstq, [dstq+strideq*2] + dec r3d + jg .loop + jmp tx2q +.end3: + RET +%endif +%endmacro + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [%1+%2*0] + pmulhrsw m1, m7, [%1+%2*1] + pmulhrsw m2, m7, [%1+%2*2] + pmulhrsw m3, m7, [%1+%2*3] + pmulhrsw m4, m7, [%1+%2*4] + pmulhrsw m5, m7, [%1+%2*5] + pmulhrsw m6, m7, [%1+%2*6] + pmulhrsw m7, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a + ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a + psubsw m%2, m%4, m%5 ;t6a + paddsw m%4, m%5 ;t7 + psubsw m%5, m%1, m%3 ;t5a + paddsw 
m%1, m%3 ;t4 + ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.pass1_end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + +.pass1_end3: + punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 + punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 + punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 + punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 + punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 + punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 + punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 + punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 + punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 + punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 + mova [rsp+gprsize+16*2], m6 + mova m6, [rsp+gprsize+16*1] + punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 + punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 + punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 + punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 + punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 + punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 + + punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 + punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 + mova [rsp+gprsize+16*0], m2 + punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 + punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 + punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 + punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 + mova m7, [rsp+gprsize+16*2] + punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 + punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 + mova m7, [rsp+gprsize+16*0] + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x8_internal).end4)] + +.pass2_main: + call .main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + 
+.end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*0], m7 + +.end3: + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 + jmp tx2q + +.end4: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +ALIGN function_align +.main: + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m1 + mova m7, [o(pd_2048)] + IDCT4_1D 0, 2, 4, 6, 1, 3, 7 + mova m3, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m2 + mova m2, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m4 + mova m4, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m6 + IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 + mova m6, [rsp+gprsize*2+16*0] + psubsw m7, m0, m4 ;out7 + paddsw m0, m4 ;out0 + mova [rsp+gprsize*2+16*0], m7 + mova m1, [rsp+gprsize*2+16*2] + psubsw m4, m6, m3 ;out4 + paddsw m3, m6 ;out3 + mova m7, [rsp+gprsize*2+16*1] + psubsw m6, m1, m5 ;out6 + paddsw m1, m5 ;out1 + psubsw m5, m7, m2 ;out5 + paddsw m2, m7 ;out2 + ret + + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + call .main_pass1_end + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal).pass1_end2 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal).end4)] + +.pass2_main: + call .main + call .main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal).end2 + +ALIGN function_align +.main: + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], 
m3 + mova [rsp+gprsize*2+16*2], m4 + mova m7, [o(pd_2048)] + ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a + ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a + paddsw m3, m2, m6 ;t2 + psubsw m2, m6 ;t6 + paddsw m4, m5, m1 ;t3 + psubsw m5, m1 ;t7 + ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a + + mova m6, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m5 + mova m1, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m2 + mova m5, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m3 + ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a + ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a + psubsw m2, m0, m6 ;t4 + paddsw m0, m6 ;t0 + paddsw m3, m5, m1 ;t1 + psubsw m5, m1 ;t5 + ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a + + mova m7, [rsp+gprsize*2+16*0] + paddsw m1, m3, m4 ;-out7 + psubsw m3, m4 ;t3 + mova [rsp+gprsize*2+16*0], m1 + psubsw m4, m0, m7 ;t2 + paddsw m0, m7 ;out0 + mova m6, [rsp+gprsize*2+16*2] + mova m7, [rsp+gprsize*2+16*1] + paddsw m1, m5, m6 ;-out1 + psubsw m5, m6 ;t6 + paddsw m6, m2, m7 ;out6 + psubsw m2, m7 ;t7 + ret +ALIGN function_align +.main_pass1_end: + mova [rsp+gprsize*2+16*1], m1 + mova [rsp+gprsize*2+16*2], m6 + punpckhwd m1, m4, m3 + punpcklwd m4, m3 + punpckhwd m7, m5, m2 + punpcklwd m5, m2 + mova m2, [o(pw_2896_2896)] + mova m6, [o(pd_2048)] + pmaddwd m3, m2, m7 + pmaddwd m2, m5 + paddd m3, m6 + paddd m2, m6 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + mova m3, [o(pw_2896_m2896)] + pmaddwd m7, m3 + pmaddwd m5, m3 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m5, m7 ;-out5 + mova m3, [o(pw_2896_2896)] + pmaddwd m7, m3, m1 + pmaddwd m3, m4 + paddd m7, m6 + paddd m3, m6 + psrad m7, 12 + psrad m3, 12 + packssdw m3, m7 ;-out3 + mova m7, [o(pw_2896_m2896)] + pmaddwd m1, m7 + pmaddwd m4, m7 + paddd m1, m6 + paddd m4, m6 + psrad m1, 12 + psrad m4, 12 + packssdw m4, m1 ;out4 + mova m1, [rsp+gprsize*2+16*1] + mova m6, [rsp+gprsize*2+16*2] + ret +ALIGN function_align +.main_pass2_end: + paddsw m7, 
m4, m3 ;t2 + t3 + psubsw m4, m3 ;t2 - t3 + paddsw m3, m5, m2 ;t6 + t7 + psubsw m5, m2 ;t6 - t7 + mova m2, [o(pw_2896x8)] + pmulhrsw m4, m2 ;out4 + pmulhrsw m5, m2 ;-out5 + pmulhrsw m7, m2 ;-out3 + pmulhrsw m2, m3 ;out2 + mova m3, m7 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass1_end + +.pass1_end: + mova m7, [o(pw_m16384)] + +.pass1_end1: + pmulhrsw m1, m7 + mova [rsp+gprsize+16*1], m1 + mova m1, m6 + mova m6, m2 + pmulhrsw m2, m5, m7 + mova m5, m6 + mova m6, m4 + pmulhrsw m4, m3, m7 + mova m3, m6 + mova m6, m0 + mova m0, m7 + pxor m7, m7 + psubw m7, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, m6 + jmp m(idct_8x8_internal).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal).end4)] + +.pass2_main: + call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*2], m2 + mova m2, m0 + pxor m0, m0 + psubw m0, m7 + mova m7, m2 + pmulhrsw m1, m0 + pmulhrsw m2, m5, m0 + mova [rsp+gprsize+16*1], m1 + mova m5, m4 + mova m1, m6 + pmulhrsw m4, m3, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + mova m3, m5 + mova [rsp+gprsize+16*0], m7 + jmp m(idct_8x8_internal).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal).end4)] + +.end: + pmulhrsw m7, [o(pw_4096)] + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_4096)] + REPX 
{pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal).end3 + + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, [o(pw_16384)] + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] +.end: + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + RET +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal).pass1)] + +.pass1: + mova m0, [coeffq+16*1] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + push tx2q + lea tx2q, [o(m(idct_4x16_internal).pass1_2)] + jmp r3 + +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(m(idct_4x16_internal).pass1_end)] + jmp r3 + +.pass1_end: + pop tx2q + + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*7], m7 + jmp tx2q + +.pass2: + call m(idct_16x4_internal).main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + +.end1: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 3, 2 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, 
[dstq+strideq*4] + WRITE_4X8 0, 1, 3, 2 + +.end2: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal).pass1)] + jmp m(idct_4x16_internal).pass1 + +.pass2: + call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass2_end + + punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 + punpckhqdq m4, m5 ;low: out8 high: out10 + punpcklqdq m5, m7, m2 ;low: out4 high: out6 + punpckhqdq m2, m7 ;low: -out9 high: -out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 + punpcklqdq m0, m6 ;low: out0 high: out2 + punpckhqdq m6, m3, m2 ;low: out12 high: out14 + punpcklqdq m2, m3 ;low: -out1 high: -out3 + + mova m7, [o(pw_2048)] + +.end1: + REPX {pmulhrsw x, m7}, m0, m5, m4, m6 + pxor m3, m3 + psubw m3, m7 + mova m7, [coeffq+16*4] + REPX {pmulhrsw x, m3}, m2, m7, m1 + pmulhrsw m3, [coeffq+16*5] + mova [coeffq+16*7], m5 + + punpckhqdq m5, m4, m7 ;low: out10 high: out11 + punpcklqdq m4, m7 ;low: out8 high: out9 + punpckhqdq m7, m6, m1 ;low: out14 high: out15 + punpcklqdq m6, m1 ;low: out12 high: out13 + punpckhqdq m1, m0, m2 ;low: out2 high: out3 + punpcklqdq m0, m2 ;low: out0 high: out1 + mova [coeffq+16*4], m4 + mova m4, [coeffq+16*7] + punpcklqdq m2, m4, m3 ;low: out4 high: out5 + punpckhqdq m4, m3 ;low: out6 high: out7 + mova m3, m4 + +.end2: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 2, 3 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 2, 3 + +.end3: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN 
flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal).pass1)] + jmp m(idct_4x16_internal).pass1 + +.pass2: + call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass2_end + + punpckhqdq m6, m5, m4 ;low: out5 high: out7 + punpcklqdq m4, m5 ;low: -out8 high: -out10 + punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 + punpcklqdq m2, m7 ;low: out9 high: out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpcklqdq m1, m6, m0 ;low: out13 high: out15 + punpckhqdq m0, m6 ;low: -out0 high: -out2 + punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 + punpckhqdq m2, m3 ;low: out1 high: out3 + + mova m7, [o(pw_m2048)] + jmp m(iadst_4x16_internal).end1 + + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*1] + mova m6, [o(pw_1697x8)] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + pcmpeqw m7, m7 + mov r3, tx2q + lea tx2q, [o(.pass1_2)] +.pass1: + pmulhrsw m4, m6, m0 + pmulhrsw m5, m6, m1 + pavgw m4, m0 + pcmpeqw m0, m7 + pavgw m5, m1 + pcmpeqw m1, m7 + pandn m0, m4 + pmulhrsw m4, m6, m2 + pandn m1, m5 + pmulhrsw m5, m6, m3 + pavgw m4, m2 + pcmpeqw m2, m7 + pavgw m5, m3 + pcmpeqw m3, m7 + pandn m2, m4 + pandn m3, m5 + jmp m(iadst_4x8_internal).pass1_end +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(.pass1_end)] + 
jmp .pass1 +.pass1_end: + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + jmp r3 +.pass2: + mova m7, [o(pw_1697x16)] + mova [coeffq+16*6], m6 + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [coeffq+16*7] + IDTX16 6, 7, 7 + mova [coeffq+16*7], m6 + mova m6, [coeffq+16*6] + pmulhrsw m7, m6, [o(pw_1697x16)] + paddsw m6, m6 + paddsw m6, m7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + jmp m(iadst_4x16_internal).end2 + + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4, 8 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + mov r2d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)] +.dconly: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 +.dconly_loop: + mova m1, [dstq] + mova m3, [dstq+strideq] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq], m1 + mova [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + jmp tx2q +.end: + RET +%endif +%endmacro + +%macro LOAD_7ROWS 2 ;src, stride + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] +%endmacro + +%macro SAVE_7ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 +%endmacro + +%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] + punpckhwd m%5, m%4, m%1 ;packed in13 in3 + punpcklwd m%1, m%4 ;packed in1 in15 + punpcklwd m%4, m%3, m%2 ;packed in9 in7 + punpckhwd m%2, m%3 ;packed in5 in11 + mova m%7, [o(pd_2048)] + ITX_MUL2X_PACK 
%1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a + ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a + ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a + psubsw m%6, m%1, m%4 ;low: t9 high: t14 + paddsw m%1, m%4 ;low: t8 high: t15 + psubsw m%4, m%5, m%2 ;low: t10 high: t13 + paddsw m%5, m%2 ;low: t11 high: t12 + mova m%2, [o(deint_shuf2)] + pshufb m%6, m%2 + pshufb m%4, m%2 + ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a + psubsw m%3, m%1, m%5 ;low: t11a high: t12a + paddsw m%1, m%5 ;low: t8a high: t15a + psubsw m%5, m%6, m%4 ;low: t10 high: t13 + paddsw m%6, m%4 ;low: t9 high: t14 + pshufb m%3, m%2 + pshufb m%5, m%2 + ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 + ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a + packssdw m%2, m%4 ;low: t11 high: t10a + packssdw m%3, m%5 ;low: t12 high: t13a + punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 + punpcklqdq m%1, m%6 ;low: t8a high: t9 +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + +.pass1_end: + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpcklwd m2, m1, m3 ;packed out3, out7 + punpckhwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpcklwd m6, m5, m7 ;packed out11, out15 + punpckhwd m5, m7 ;packed out10, out14 + +.pass1_end2: + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*6] + mova [coeffq+16*6], m7 + +.pass1_end3: + punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high + punpcklwd m3, m6 ;packed 9, 11, 13, 15 low + punpckhwd m6, m4, m5 ;packed 8, 10, 
12, 14 high + punpcklwd m4, m5 ;packed 8, 10, 12, 14 low + punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) + punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) + punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) + punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) + mova [coeffq+16*7], m3 + mova m3, [coeffq+16*6] + punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high + punpcklwd m3, m2 ;packed 1, 3, 5, 7 low + punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high + punpcklwd m0, m1 ;packed 0, 2, 4, 6 low + punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) + punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) + punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) + punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x4_internal).pass2)] + +.pass2_end: + mova [coeffq+16*4], m4 + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + lea r3, [dstq+8] + call tx2q + + add coeffq, 16*4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mov dstq, r3 + jmp tx2q + +ALIGN function_align +.main: + punpckhqdq m7, m0, m1 ;low:in1 high:in3 + punpcklqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m3, m2 ;low:in7 high:in5 + mova [coeffq+16*4], m7 + mova [coeffq+16*5], m3 + mova m7, [coeffq+16*7] + punpcklqdq m2, m4, m5 + punpckhqdq m4, m5 ;low:in9 high:in11 + punpcklqdq m3, m6, m7 + punpckhqdq m7, m6 ;low:in15 high:in13 + mova [coeffq+16*6], m4 + IDCT8_1D_PACKED + mova m6, [coeffq+16*4] + mova m4, [coeffq+16*5] + mova m5, [coeffq+16*6] + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*6], m3 + + IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 + + mova m1, [coeffq+16*4] + psubsw m3, m0, m7 ;low:out15 high:out14 + paddsw m0, m7 ;low:out0 high:out1 + psubsw m7, m1, m5 ;low:out12 high:out13 + paddsw m1, m5 ;low:out3 high:out2 + mova [coeffq+16*7], m3 + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*6] + psubsw m5, m2, m4 ;low:out11 high:out10 + paddsw m2, m4 ;low:out4 high:out5 + psubsw m4, m3, m6 ;low:out8 
high:out9 + paddsw m3, m6 ;low:out7 high:out6 + mova m6, m7 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + call .main_pass1_end + + punpckhwd m6, m7, m0 ;packed -out11, -out15 + punpcklwd m0, m7 ;packed out0, out4 + punpcklwd m7, m3, m4 ;packed -out3, -out7 + punpckhwd m4, m3 ;packed out8, out12 + mova m1, [coeffq+16*6] + punpcklwd m3, m1, m5 ;packed -out1, -out5 + punpckhwd m5, m1 ;packed out10, out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpckhwd m3, m2, m1 ;packed -out9, -out13 + punpcklwd m1, m2 ;packed out2, out6 + + mova m7, [o(pw_16384)] + +.pass1_end: + REPX {pmulhrsw x, m7}, m0, m1, m4, m5 + pxor m2, m2 + psubw m2, m7 + mova m7, [coeffq+16*6] + REPX {pmulhrsw x, m2}, m7, m3, m6 + pmulhrsw m2, [coeffq+16*7] + mova [coeffq+16*6], m7 + jmp m(idct_16x4_internal).pass1_end3 + +.pass2: + lea tx2q, [o(m(iadst_8x4_internal).pass2)] + jmp m(idct_16x4_internal).pass2_end + +ALIGN function_align +.main: + mova [coeffq+16*6], m0 + pshufd m0, m1, q1032 + pshufd m2, m2, q1032 + punpckhwd m1, m6, m0 ;packed in13, in2 + punpcklwd m0, m6 ;packed in3, in12 + punpckhwd m7, m5, m2 ;packed in11, in4 + punpcklwd m2, m5 ;packed in5, in10 + mova m6, [o(pd_2048)] + ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 + ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 + ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 + ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 + psubsw m5, m1, m2 ;low:t10a high:t11a + paddsw m1, m2 ;low:t2a high:t3a + psubsw m2, m7, m0 ;low:t12a high:t13a + paddsw m7, m0 ;low:t4a high:t5a + punpcklqdq m0, m5 + punpckhwd m0, m5 ;packed t10a, t11a + punpcklqdq m5, m2 + punpckhwd m2, m5 ;packed t13a, t12a + ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 + ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 + mova 
[coeffq+16*4], m1 + mova [coeffq+16*5], m7 + mova m1, [coeffq+16*6] + mova m7, [coeffq+16*7] + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + punpckhwd m5, m7, m1 ;packed in15, in0 + punpcklwd m1, m7 ;packed in1, in14 + punpckhwd m7, m4, m3 ;packed in9, in6 + punpcklwd m3, m4 ;packed in7, in8 + ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 + ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 + ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 + ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 + psubsw m4, m5, m3 ;low:t8a high:t9a + paddsw m5, m3 ;low:t0a high:t1a + psubsw m3, m7, m1 ;low:t14a high:t15a + paddsw m7, m1 ;low:t6a high:t7a + punpcklqdq m1, m4 + punpckhwd m1, m4 ;packed t8a, t9a + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t15a, t14a + ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 + ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 + paddsw m4, m1, m2 ;low:t8a high:t9a + psubsw m1, m2 ;low:t12a high:t13a + psubsw m2, m0, m3 ;low:t14a high:t15a + paddsw m0, m3 ;low:t10a high:t11a + punpcklqdq m3, m1 + punpckhwd m3, m1 ;packed t12a, t13a + punpcklqdq m1, m2 + punpckhwd m2, m1 ;packed t15a, t14a + ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 + ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 + psubsw m1, m3, m2 ;low:t14a high:t15a + paddsw m3, m2 ;low:out2 high:-out13 + psubsw m2, m4, m0 ;low:t10 high:t11 + paddsw m0, m4 ;low:-out1 high:out14 + mova [coeffq+16*6], m0 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + psubsw m4, m5, m3 ;low:t4 high:t5 + paddsw m5, m3 ;low:t0 high:t1 + psubsw m3, m0, m7 ;low:t6 high:t7 + paddsw m0, m7 ;low:t2 high:t3 + punpcklqdq m7, m4 + punpckhwd m7, m4 ;packed t4, t5 + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t7, t6 + ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a + ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a + psubsw m4, m5, m0 ;low:t2a high:t3a + paddsw m0, m5 ;low:out0 high:-out15 + psubsw m5, m7, m3 ;low:t6 high:t7 + paddsw m3, m7 
;low:-out3 high:out12 + ret +ALIGN function_align +.main_pass1_end: + mova m7, [o(deint_shuf1)] + mova [coeffq+16*4], m0 + mova [coeffq+16*5], m3 + mova m0, [o(pw_2896_m2896)] + mova m3, [o(pw_2896_2896)] + pshufb m1, m7 ;t14a t15a + pshufb m2, m7 ;t10 t11 + pshufb m4, m7 ;t2a t3a + pshufb m5, m7 ;t6 t7 + pmaddwd m7, m0, m2 + pmaddwd m2, m3 + paddd m7, m6 + paddd m2, m6 + psrad m7, 12 + psrad m2, 12 + packssdw m2, m7 ;low:out6 high:-out9 + pmaddwd m7, m0, m4 + pmaddwd m4, m3 + paddd m7, m6 + paddd m4, m6 + psrad m7, 12 + psrad m4, 12 + packssdw m4, m7 ;low:-out7 high:out8 + pmaddwd m7, m3, m5 + pmaddwd m5, m0 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m7, m5 ;low:out4 high:-out11 + pmaddwd m5, m3, m1 + pmaddwd m1, m0 + paddd m5, m6 + paddd m1, m6 + psrad m5, 12 + psrad m1, 12 + packssdw m5, m1 ;low:-out5 high:out10 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + ret +ALIGN function_align +.main_pass2_end: + mova m7, [o(pw_2896x8)] + punpckhqdq m6, m2, m1 ;low:t11 high:t15a + punpcklqdq m2, m1 ;low:t10 high:t14a + psubsw m1, m2, m6 + paddsw m2, m6 + punpckhqdq m6, m4, m5 ;low:t3a high:t7 + punpcklqdq m4, m5 ;low:t2a high:t6 + psubsw m5, m4, m6 + paddsw m4, m6 + pmulhrsw m1, m7 ;low:-out9 high:out10 + pmulhrsw m2, m7 ;low:out6 high:-out5 + pmulhrsw m5, m7 ;low:out8 high:-out11 + pmulhrsw m4, m7 ;low:-out7 high:out4 + punpckhqdq m7, m4, m5 ;low:out4 high:-out11 + punpcklqdq m4, m5 ;low:-out7 high:out8 + punpckhqdq m5, m2, m1 ;low:-out5 high:out10 + punpcklqdq m2, m1 ;low:out6 high:-out9 + ret + + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass1_end + + punpcklwd m6, m7, m0 ;packed out11, out15 + punpckhwd m0, m7 ;packed -out0, -out4 + punpckhwd m7, m3, m4 ;packed out3, out7 + 
punpcklwd m4, m3 ;packed -out8, -out12 + mova m1, [coeffq+16*6] + punpckhwd m3, m1, m5 ;packed out1, out5 + punpcklwd m5, m1 ;packed -out10, -out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpcklwd m3, m2, m1 ;packed out9, out13 + punpckhwd m1, m2 ;packed -out2, -out6 + + mova m7, [o(pw_m16384)] + jmp m(iadst_16x4_internal).pass1_end + +.pass2: + lea tx2q, [o(m(iflipadst_8x4_internal).pass2)] + jmp m(idct_16x4_internal).pass2_end + + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m1, [coeffq+16*6] + mova m0, [coeffq+16*5] + mova m2, [coeffq+16*7] + mova m6, [o(pw_1697x16)] + mova m7, [o(pw_16384)] + pmulhrsw m4, m6, m1 + pmulhrsw m3, m6, m0 + pmulhrsw m5, m6, m2 + pmulhrsw m4, m7 + pmulhrsw m3, m7 + pmulhrsw m5, m7 + paddsw m1, m4 + paddsw m0, m3 + paddsw m5, m2 + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mova m4, [coeffq+16*4] + mova [coeffq+16*6], m1 + mova [coeffq+16*5], m0 + mova [coeffq+16*7], m5 + pmulhrsw m0, m6, m2 + pmulhrsw m1, m6, m3 + pmulhrsw m5, m6, m4 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + pmulhrsw m5, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m5 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pmulhrsw m5, m6, m0 + pmulhrsw m6, m1 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + paddsw m0, m5 + paddsw m1, m6 + mova m6, [coeffq+16*6] + mova m5, [coeffq+16*5] + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpckhwd m2, m1, m3 ;packed out3, out7 + punpcklwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpckhwd m6, m5, m7 ;packed out11, out15 + punpcklwd m5, m7 ;packed out10, out14 + jmp m(idct_16x4_internal).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x4_internal).pass2)] + jmp 
m(idct_16x4_internal).pass2_end + + +%macro SAVE_8ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 + mova [%1+%2*7], m7 +%endmacro + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16, 8, 16*16 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m2 + psrlw m2, 3 ; pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mov r3d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)] + jmp m(inv_txfm_add_dct_dct_8x8).loop +.end: + RET +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal).pass1)] + +.pass1: + LOAD_8ROWS coeffq+16*1, 32, 1 + mov [rsp+gprsize+16*11], tx2q + lea tx2q, [o(m(idct_8x16_internal).pass1_end)] + jmp r3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, [rsp+gprsize+16*11] + jmp r3 + +.pass2: + lea tx2q, [o(m(idct_8x16_internal).end)] + +.pass2_pre: + mova [coeffq+16*2 ], m1 + mova [coeffq+16*6 ], m3 + mova [coeffq+16*10], m5 + mova [coeffq+16*14], m7 + mova m1, m2 + mova m2, m4 + mova m3, m6 + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + +.pass2_main: + call m(idct_8x8_internal).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+16*2 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*10] + mova m3, [coeffq+16*14] + mova m4, [coeffq+16*3 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*15] + call m(idct_16x8_internal).main + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 
+ lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal).pass1)] + jmp m(idct_8x16_internal).pass1 + +.pass2: + lea tx2q, [o(m(iadst_8x16_internal).end)] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(iadst_8x8_internal).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal).end + + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal).pass1)] + jmp m(idct_8x16_internal).pass1 + +.pass2: + lea tx2q, [o(m(iflipadst_8x16_internal).end)] + lea r3, [dstq+strideq*8] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, 
[coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end + jmp m(iflipadst_8x8_internal).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal).end + + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 32, 1 + mov r3, tx2q + lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)] + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, r3 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x16_internal).end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m6 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*2], m5 + IDTX16 6, 5, 7 + mova m5, [rsp+gprsize+16*0] + IDTX16 5, 7, 7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+16*2] + mova [rsp+gprsize+16*0], m5 + mova [rsp+gprsize+16*1], m6 + mova [rsp+gprsize+16*2], m7 + jmp m(idct_8x8_internal).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, 
[o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)] + jmp m(inv_txfm_add_dct_dct_16x4).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 32, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*1, 32, 1 + call .main + mov r3, tx2q + lea tx2q, [o(m(idct_16x8_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal).pass1_end + +.pass2: + lea tx2q, [o(m(idct_16x8_internal).end)] + lea r3, [dstq+8] + jmp m(idct_8x8_internal).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal).pass2_main + + +ALIGN function_align +.main: + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + mova [rsp+gprsize*2+32*5], m5 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a + ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a + psubsw m2, m0, m4 ;t9 + paddsw m0, m4 ;t8 + psubsw m4, m7, m3 ;t14 + paddsw m7, m3 ;t15 + ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a + mova m3, [rsp+gprsize*2+16*1] + mova m5, [rsp+gprsize*2+32*5] + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+32*5], m4 + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m7 + ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a + ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a + psubsw m4, m2, m3 ;t10 + paddsw m2, m3 ;t11 + psubsw m3, m1, m5 ;t13 + paddsw m1, m5 ;t12 + ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a + mova m7, [rsp+gprsize*2+32*5] + psubsw m6, m0, m2 ;t11a + paddsw m0, m2 ;t8a + paddsw m2, m7, m3 ;t9 + psubsw m7, 
m3 ;t10 + mova m5, [rsp+gprsize*2+16*0] + psubsw m3, m5, m0 ;out8 + paddsw m0, m5 ;out7 + mova [rsp+gprsize*2+32*5], m0 + mova m5, [rsp+gprsize*2+16*9] + psubsw m0, m5, m2 ;out9 + paddsw m2, m5 ;out6 + mova [rsp+gprsize*2+16*0], m0 + mova [rsp+gprsize*2+16*9], m2 + mova m0, [rsp+gprsize*2+16*1] + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*1], m3 + psubsw m5, m0, m4 ;t13 + paddsw m0, m4 ;t14 + mova m3, [o(pd_2048)] + psubsw m4, m2, m1 ;t12a + paddsw m1, m2 ;t15a + mova [rsp+gprsize*2+16*2], m1 + ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a + ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 + mova m3, [rsp+gprsize*2+16*8] + psubsw m2, m3, m5 ;out10 + paddsw m3, m5 ;out5 + mova m5, [rsp+gprsize*2+16*7] + mova [rsp+gprsize*2+16*8], m3 + psubsw m3, m5, m4 ;out11 + paddsw m5, m4 ;out4 + mova m4, [rsp+gprsize*2+16*6] + mova [rsp+gprsize*2+16*7], m5 + paddsw m5, m4, m6 ;out3 + psubsw m4, m6 ;out12 + mova m6, [rsp+gprsize*2+16*5] + mova [rsp+gprsize*2+16*6], m5 + psubsw m5, m6, m7 ;out13 + paddsw m6, m7 ;out2 + mova m7, [rsp+gprsize*2+16*4] + mova [rsp+gprsize*2+16*5], m6 + psubsw m6, m7, m0 ;out14 + paddsw m7, m0 ;out1 + mova m1, [rsp+gprsize*2+16*2] + mova m0, [rsp+gprsize*2+16*3] + mova [rsp+gprsize*2+16*4], m7 + psubsw m7, m0, m1 ;out15 + paddsw m0, m1 ;out0 + mova [rsp+gprsize*2+16*3], m0 + mova m1, [rsp+gprsize*2+16*0] + mova m0, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*0], m7 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, 
[coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call .main + call .main_pass1_end + mov r3, tx2q + lea tx2q, [o(m(iadst_16x8_internal).pass1_end)] + jmp m(iadst_8x8_internal).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iadst_8x8_internal).pass1_end + +.pass2: + lea tx2q, [o(m(iadst_16x8_internal).end)] + lea r3, [dstq+8] + jmp m(iadst_8x8_internal).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal).pass2_main + +ALIGN function_align +.main: + mova [rsp+gprsize*2+16*0], m1 + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 + ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 + psubsw m1, m0, m4 ;t10a + paddsw m0, m4 ;t2a + psubsw m4, m7, m3 ;t11a + paddsw m3, m7 ;t3a + ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 + mova m2, [rsp+gprsize*2+16*0] ;in3 + mova m7, [rsp+gprsize*2+16*1] ;in4 + mova [rsp+gprsize*2+16*0], m1 ;t11 + mova [rsp+gprsize*2+16*1], m4 ;t10 + mova m1, [rsp+gprsize*2+16*2] ;in12 + mova [rsp+gprsize*2+16*2], m0 ;t2a + ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 + ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 + psubsw m0, m7, m1 ;t12a + paddsw m1, m7 ;t4a + psubsw m4, m5, m2 ;t13a + paddsw m5, m2 ;t5a + ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 + mova m2, [rsp+gprsize*2+16*8] ;in1 + mova m7, [rsp+gprsize*2+16*9] ;in14 + mova [rsp+gprsize*2+16*8], m4 ;t12 + mova [rsp+gprsize*2+16*9], 
m0 ;t13 + mova m4, [rsp+gprsize*2+16*4] ;in9 + mova m0, [rsp+gprsize*2+16*5] ;in6 + mova [rsp+gprsize*2+16*4], m1 ;t4a + mova [rsp+gprsize*2+16*5], m5 ;t5a + ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 + ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 + psubsw m1, m0, m7 ;t14a + paddsw m0, m7 ;t6a + psubsw m5, m4, m2 ;t15a + paddsw m4, m2 ;t7a + ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 + mova m2, [rsp+gprsize*2+16*2] ;t2a + mova [rsp+gprsize*2+16*2], m5 ;t14 + psubsw m7, m2, m0 ;t6 + paddsw m2, m0 ;t2 + psubsw m0, m3, m4 ;t7 + paddsw m3, m4 ;t3 + ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a + mova m4, [rsp+gprsize*2+16*7] ;in0 + mova m5, [rsp+gprsize*2+32*5] ;in15 + mova [rsp+gprsize*2+16*7], m3 ;t3 + mova [rsp+gprsize*2+32*5], m1 ;t15 + mova m1, [rsp+gprsize*2+16*6] ;in7 + mova m3, [rsp+gprsize*2+16*3] ;in8 + mova [rsp+gprsize*2+16*6], m7 ;t7a + mova [rsp+gprsize*2+16*3], m0 ;t6a + ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 + ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 + psubsw m0, m4, m3 ;t8a + paddsw m4, m3 ;t0a + psubsw m3, m5, m1 ;t9a + paddsw m5, m1 ;t1a + ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 + mova m1, [rsp+gprsize*2+16*4] ;t4a + mova m7, [rsp+gprsize*2+16*5] ;t5a + mova [rsp+gprsize*2+16*4], m3 ;t8 + mova [rsp+gprsize*2+16*5], m0 ;t9 + psubsw m0, m4, m1 ;t4 + paddsw m4, m1 ;t0 + psubsw m3, m5, m7 ;t5 + paddsw m5, m7 ;t1 + ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a + mova m7, [rsp+gprsize*2+16*3] ;t6a + psubsw m1, m4, m2 ;t2a + paddsw m4, m2 ;out0 + mova [rsp+gprsize*2+16*3], m4 ;out0 + mova m4, [rsp+gprsize*2+16*6] ;t7a + psubsw m2, m3, m7 ;t6 + paddsw m3, m7 ;-out3 + mova [rsp+gprsize*2+16*6], m3 ;-out3 + psubsw m3, m0, m4 ;t7 + paddsw m0, m4 ;out12 + mova [rsp+gprsize*2+16*12], m3 + mova m3, [rsp+gprsize*2+16*7] ;t3 + mova [rsp+gprsize*2+16* 7], m2 ;out4 + psubsw m2, m5, m3 ;t3a + paddsw m5, m3 ;-out15 + mova [rsp+gprsize*2+16*11], m2 + mova m2, [rsp+gprsize*2+32*5] ;t15 + mova [rsp+gprsize*2+16*10], m1 ;-out7 + 
mova m1, [rsp+gprsize*2+16*0] ;t11 + mova [rsp+gprsize*2+16*0 ], m5 ;-out15 + mova m3, [rsp+gprsize*2+16*1] ;t10 + mova [rsp+gprsize*2+16*1 ], m4 ;-out11 + mova m4, [rsp+gprsize*2+16*2] ;t14 + mova [rsp+gprsize*2+16*2 ], m0 ;out12 + psubsw m0, m3, m4 ;t14a + paddsw m3, m4 ;t10a + psubsw m5, m1, m2 ;t15a + paddsw m1, m2 ;t11a + ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 + mova m2, [rsp+gprsize*2+16*4] ;t8 + mova m4, [rsp+gprsize*2+16*5] ;t9 + mova [rsp+gprsize*2+16*4], m3 ;t10a + mova [rsp+gprsize*2+16*5], m1 ;t11a + mova m3, [rsp+gprsize*2+16*8] ;t12 + mova m1, [rsp+gprsize*2+16*9] ;t13 + mova [rsp+gprsize*2+16*8], m5 ;t14 + mova [rsp+gprsize*2+16*9], m0 ;t15 + psubsw m5, m2, m3 ;t12a + paddsw m2, m3 ;t8a + psubsw m0, m4, m1 ;t13a + paddsw m4, m1 ;t9a + ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 + mova m6, [rsp+gprsize*2+16*4] ;t10a + mova m1, [rsp+gprsize*2+16*5] ;t11a + psubsw m3, m2, m6 ;t10 + paddsw m2, m6 ;-out1 + paddsw m6, m4, m1 ;out14 + psubsw m4, m1 ;t11 + mova [rsp+gprsize*2+16*14], m4 + mova [rsp+gprsize*2+16* 4], m2 ;-out1 + mova m4, [rsp+gprsize*2+16*8] ;t14 + mova m2, [rsp+gprsize*2+16*9] ;t15 + mova [rsp+gprsize*2+16* 9], m3 ;out6 + psubsw m3, m0, m4 ;t14a + paddsw m0, m4 ;out2 + psubsw m4, m5, m2 ;t15a + paddsw m5, m2 ;-out13 + mova [rsp+gprsize*2+16* 5], m0 ;out2 + ret +ALIGN function_align +.main_pass1_end: + mova m0, [rsp+gprsize*2+16*14] + mova [rsp+gprsize*2+16*14], m5 + mova [rsp+gprsize*2+16*15], m6 + mova m5, [o(pw_2896_2896)] + mova m6, [o(pw_2896_m2896)] + mova m7, [o(pd_2048)] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m4, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m4, m2, m1, m3 + REPX {psrad x, 12}, m4, m1, m2, m3 + packssdw m4, m1 ;-out5 + packssdw m2, m3 ;out10 + mova [rsp+gprsize*2+16* 8], m4 + mova m3, [rsp+gprsize*2+16* 9] + punpcklwd m1, m3, m0 + punpckhwd m3, m0 + pmaddwd m0, m5, m1 + pmaddwd m1, m6 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m0, m1, 
m4, m3 + REPX {psrad x, 12}, m0, m4, m1, m3 + packssdw m0, m4 ;out6 + packssdw m1, m3 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + mova m0, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + punpcklwd m3, m0, m4 + punpckhwd m0, m4 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + pmaddwd m5, m0 + pmaddwd m0, m6 + REPX {paddd x, m7}, m4, m3, m5, m0 + REPX {psrad x, 12}, m4, m5, m3, m0 + packssdw m4, m5 ;out4 + packssdw m3, m0 ;-out11 + mova [rsp+gprsize*2+16* 7], m4 + mova m4, [rsp+gprsize*2+16*10] + mova m5, [rsp+gprsize*2+16*11] + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + pmaddwd m5, m0, [o(pw_2896_2896)] + pmaddwd m0, m6 + pmaddwd m6, m4 + pmaddwd m4, [o(pw_2896_2896)] + REPX {paddd x, m7}, m5, m0, m6, m4 + REPX {psrad x, 12}, m0, m6, m5, m4 + packssdw m0, m6 ;out8 + packssdw m5, m4 ;-out7 + mova [rsp+gprsize*2+16*10], m5 + mova m4, [rsp+gprsize*2+16* 2] ;out12 + mova m5, [rsp+gprsize*2+16*14] ;-out13 + mova m6, [rsp+gprsize*2+16*15] ;out14 + ret +ALIGN function_align +.main_pass2_end: + mova m7, [o(pw_2896x8)] + mova m1, [rsp+gprsize*2+16* 9] + mova m2, [rsp+gprsize*2+16*14] + paddsw m0, m1, m2 + psubsw m1, m2 + pmulhrsw m0, m7 ;out6 + pmulhrsw m1, m7 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + psubsw m2, m3, m4 + paddsw m3, m4 + pmulhrsw m2, m7 ;out10 + pmulhrsw m3, m7 ;-out5 + mova [rsp+gprsize*2+16* 8], m3 + mova m3, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + paddsw m0, m3, m4 + psubsw m3, m4 + pmulhrsw m0, m7 ;out4 + pmulhrsw m3, m7 ;-out11 + mova [rsp+gprsize*2+16* 7], m0 + mova m0, [rsp+gprsize*2+16*10] + paddsw m4, m0, [rsp+gprsize*2+16*11] + psubsw m0, [rsp+gprsize*2+16*11] + pmulhrsw m4, m7 ;-out7 + pmulhrsw m0, m7 ;out8 + mova [rsp+gprsize*2+16*10], m4 + mova m4, [rsp+gprsize*2+16*2 ] ;out12 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw 
m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, tx2q + lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] + jmp m(iflipadst_8x8_internal).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iflipadst_8x8_internal).pass1_end + +.pass2: + lea tx2q, [o(m(iflipadst_16x8_internal).end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x8_internal).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal).pass2_main + + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*16 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mov r3, tx2q + lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] + +.pass1: + mova m0, [o(pw_2896x8)] + mova m2, [o(pw_1697x16)] + mova m3, 
[o(pw_16384)] + sub coeffq, 8*16 + REPX {pmulhrsw x, m0}, m4, m5, m6, m7 + pmulhrsw m1, m2, m4 + pmulhrsw m1, m3 + paddsw m1, m4 ; 1 + pmulhrsw m4, m2, m5 + pmulhrsw m4, m3 + paddsw m4, m5 ; 3 + pmulhrsw m5, m2, m6 + pmulhrsw m5, m3 + paddsw m5, m6 ; 5 + pmulhrsw m6, m2, m7 + pmulhrsw m6, m3 + paddsw m7, m6 ; 7 + pmulhrsw m6, m0, [coeffq+16*6] + mova [rsp+gprsize+16*0], m4 + pmulhrsw m4, m2, m6 + pmulhrsw m4, m3 + paddsw m6, m4 ; 6 + pmulhrsw m4, m0, [coeffq+16*4] + mova [rsp+gprsize+16*1], m6 + pmulhrsw m6, m2, m4 + pmulhrsw m6, m3 + paddsw m4, m6 ; 4 + pmulhrsw m6, m0, [coeffq+16*2] + pmulhrsw m0, [coeffq+16*0] + pmulhrsw m2, m6 + pmulhrsw m2, m3 + paddsw m2, m6 ; 2 + pmulhrsw m6, m0, [o(pw_1697x16)] + pmulhrsw m6, m3 + mova m3, [rsp+gprsize+16*0] + paddsw m0, m6 + jmp m(idct_8x8_internal).pass1_end3 + +.pass1_end: + mova [coeffq+16*1], m4 + mova [coeffq+16*3], m5 + mova [coeffq+16*5], m6 + mova [coeffq+16*7], m7 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mova [coeffq-16*7], m0 + mova [coeffq-16*5], m1 + mova [coeffq-16*3], m2 + mova [coeffq-16*1], m3 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea tx2q, [o(m(iidentity_16x8_internal).end)] + lea r3, [dstq+8] + jmp m(iidentity_8x8_internal).end + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + mov dstq, r3 + jmp m(iidentity_8x8_internal).end + + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)] + jmp m(inv_txfm_add_dct_dct_16x4).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 64 + call 
m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_16x8_internal).main + mov r3, tx2q + lea tx2q, [o(m(idct_16x16_internal).pass1_end)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal).main + lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass2: + lea tx2q, [o(m(idct_16x16_internal).end)] + jmp m(idct_8x16_internal).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x16_internal).end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(idct_8x8_internal).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*4 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*12] + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + lea tx2q, [o(m(idct_8x16_internal).end)] + jmp m(idct_8x16_internal).pass2_main + + +%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 + mova m0, [coeffq+16*1 ] + mova m1, [coeffq+16*3 ] + mova m2, [coeffq+16*29] + mova m3, [coeffq+16*31] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*13] + mova m1, 
[coeffq+16*15] + mova m2, [coeffq+16*17] + mova m3, [coeffq+16*19] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*5 ] + mova m1, [coeffq+16*7 ] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*11] + mova m4, [coeffq+16*21] + mova m5, [coeffq+16*23] + mova m6, [coeffq+16*25] + mova m7, [coeffq+16*27] +%endmacro + +%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*2 ] + mova m2, [coeffq+16*28] + mova m3, [coeffq+16*30] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*12] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*16] + mova m3, [coeffq+16*18] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m4, [coeffq+16*20] + mova m5, [coeffq+16*22] + mova m6, [coeffq+16*24] + mova m7, [coeffq+16*26] +%endmacro + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end + + lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] + mova m7, [o(pw_8192)] + jmp 
m(iadst_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal).pass1_end1 + +.pass2: + lea tx2q, [o(m(iadst_16x16_internal).end)] + jmp m(iadst_8x16_internal).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iadst_16x16_internal).end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(iadst_8x8_internal).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + lea tx2q, [o(m(iadst_8x16_internal).end)] + jmp m(iadst_8x16_internal).pass2_main + + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*17, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova 
[rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS coeffq+16* 0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal).pass1_end1 + +.pass2: + lea tx2q, [o(m(iflipadst_16x16_internal).end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x16_internal).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + + lea tx2q, [o(m(iflipadst_16x16_internal).end2)] + mov dstq, r3 + jmp m(iflipadst_8x16_internal).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal).end + + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*17 + mov r3, tx2q + lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)] + +.pass1: + mova m6, [o(pw_1697x16)] + mova m7, [coeffq+32*6] + mova m0, [coeffq+32*0] + mova m1, [coeffq+32*1] + mova m2, [coeffq+32*2] + mova m3, [coeffq+32*3] + mova m4, [coeffq+32*4] + REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 + mova m5, 
[coeffq+32*5] + mova [rsp+gprsize+16*1], m7 + IDTX16B 5, 7, 6 + mova m7, [coeffq+32*7] + IDTX16B 7, 6, 6 + jmp m(idct_8x8_internal).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)] + jmp .pass1 + +.pass1_end1: + SAVE_8ROWS coeffq, 32 + sub coeffq, 15*16 + lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)] + jmp .pass1 + +.pass1_end2: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea r3, [dstq+8] + lea tx2q, [o(m(iidentity_16x16_internal).end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m4 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 + mova m4, [o(pw_2048)] + pmulhrsw m5, m4 + pmulhrsw m6, m4 + mova [rsp+gprsize+16*2], m5 + mova m5, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*1], m6 + IDTX16 5, 6, 7 + mova m6, [rsp+gprsize+16*0] + IDTX16 6, 7, 7 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 + pmulhrsw m4, m5 + mova [rsp+gprsize+16*0], m6 + jmp m(idct_8x8_internal).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(iidentity_16x16_internal).end2)] + lea dstq, [dstq+strideq*2] + jmp .end + +.end2: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + LOAD_8ROWS coeffq, 32 + lea tx2q, [o(m(iidentity_16x16_internal).end3)] + mov dstq, r3 + jmp .end + +.end3: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_8x32_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 2 ;pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + mov r3d, 8 + lea tx2q, 
[o(m(inv_txfm_add_dct_dct_8x32).end)] + jmp m(inv_txfm_add_dct_dct_8x8).loop + +.end: + RET + + + +cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + cmp eobd, 106 + jle .fast + + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_8x8_internal).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal).pass1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1: + mova [rsp+gprsize+16*9 ], m0 ;in24 + mova [rsp+gprsize+16*10], m4 ;in28 + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_8x8_internal).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal).pass1_1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_1: + mova [rsp+gprsize+16*7 ], m0 ;in16 + mova [rsp+gprsize+16*8 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + +.fast: + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end: + mova [rsp+gprsize+16*5 ], m0 ;in8 + mova [rsp+gprsize+16*6 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova 
[rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + mova m1, m4 ;in4 + mova m2, [rsp+gprsize+16*5 ] ;in8 + mova m3, [rsp+gprsize+16*6 ] ;in12 + + cmp eobd, 106 + jg .full + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + mova m0, [rsp+gprsize+16*11] + mova m1, [rsp+gprsize+16*12] + mova m2, [rsp+gprsize+16*13] + mova m3, [rsp+gprsize+16*14] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call .main_fast + jmp .pass2 + +.full: + mova m4, [rsp+gprsize+16*7 ] ;in16 + mova m5, [rsp+gprsize+16*8 ] ;in20 + mova m6, [rsp+gprsize+16*9 ] ;in24 + mova m7, [rsp+gprsize+16*10] ;in28 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + call .main + +.pass2: + lea r3, [o(m(idct_8x32_internal).end6)] + +.end: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_8x32_internal).end2)] + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + + jmp tx2q + +.end2: + lea tx2q, [o(m(idct_8x32_internal).end3)] + jmp m(idct_8x8_internal).end + +.end3: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(m(idct_8x32_internal).end4)] + jmp m(idct_8x8_internal).end + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(m(idct_8x32_internal).end5)] + jmp m(idct_8x8_internal).end + +.end5: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + mov tx2q, r3 + jmp m(idct_8x8_internal).end + +.end6: + ret + +ALIGN function_align +.main_veryfast: + mova m0, [rsp+gprsize*2+16*19] 
;in1 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 + pmulhrsw m0, [o(pw_201x8)] ;t16,t17 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*34], m3 ;t31 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*20], m3 ;t17a + mova [rsp+gprsize*2+16*33], m0 ;t30a + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 + pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 + mova [rsp+gprsize*2+16*22], m1 ;t19 + mova [rsp+gprsize*2+16*31], m2 ;t28 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m2 ;t18a + mova [rsp+gprsize*2+16*32], m1 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 + pmulhrsw m0, [o(pw_995x8)] ;t20, t21 + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*30], m3 ;t27 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m3 ;t21a + mova [rsp+gprsize*2+16*29], m0 ;t26a + mova m2, [rsp+gprsize*2+16*26] ;in3 + pxor m0, m0 + mova m3, m0 + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +.main_fast: ;bottom half is zero + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a + pmulhrsw m0, [o(pw_201x8)] ;t16a + pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a + pmulhrsw m1, [o(pw_m2751x8)] ;t17a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t17 + paddsw m0, m1 ;t16 + psubsw m5, m3, m2 ;t30 + paddsw m3, m2 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m3, m0, [o(pw_3703x8)] + pmulhrsw m0, [o(pw_1751x8)] + pmulhrsw m2, m1, [o(pw_3857x8)] + pmulhrsw m1, [o(pw_m1380x8)] + psubsw m4, m1, m0 ;t18 + paddsw m0, m1 ;t19 + psubsw m5, m2, m3 
;t29 + paddsw m3, m2 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + pmulhrsw m3, m0, [o(pw_3973x8)] + pmulhrsw m0, [o(pw_995x8)] + pmulhrsw m2, m1, [o(pw_3513x8)] + pmulhrsw m1, [o(pw_m2106x8)] + psubsw m4, m0, m1 ;t21 + paddsw m0, m1 ;t20 + psubsw m5, m3, m2 ;t26 + paddsw m3, m2 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m2, [rsp+gprsize*2+16*26] ;in3 + pmulhrsw m3, m0, [o(pw_3290x8)] + pmulhrsw m0, [o(pw_2440x8)] + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +.main: + mova m7, [o(pd_2048)] + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + mova m2, [rsp+gprsize*2+16*33] ;in17 + mova m3, [rsp+gprsize*2+16*34] ;in31 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a + psubsw m4, m0, m2 ;t17 + paddsw m0, m2 ;t16 + psubsw m5, m3, m1 ;t30 + paddsw m3, m1 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + mova m2, [rsp+gprsize*2+16*31] ;in25 + mova m3, [rsp+gprsize*2+16*32] ;in23 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a + psubsw m4, m2, m0 ;t18 + paddsw m0, m2 ;t19 + psubsw m5, m1, m3 ;t29 + paddsw m3, m1 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova 
[rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + mova m2, [rsp+gprsize*2+16*29] ;in21 + mova m3, [rsp+gprsize*2+16*30] ;in27 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a + psubsw m4, m0, m2 ;t21 + paddsw m0, m2 ;t20 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m1, [rsp+gprsize*2+16*26] ;in3 + mova m2, [rsp+gprsize*2+16*27] ;in29 + mova m3, [rsp+gprsize*2+16*28] ;in19 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a + +.main2: + psubsw m4, m2, m0 ;t22 + paddsw m0, m2 ;t23 + psubsw m5, m1, m3 ;t25 + paddsw m3, m1 ;t24 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a + mova m2, [rsp+gprsize*2+16*24] ;t21a + psubsw m1, m5, m2 ;t21 + paddsw m5, m2 ;t22 + mova [rsp+gprsize*2+16*25], m5 ;t22 + mova m2, [rsp+gprsize*2+16*29] ;t26a + psubsw m5, m4, m2 ;t26 + paddsw m4, m2 ;t25 + mova [rsp+gprsize*2+16*28], m4 ;t25 + ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m1 ;t26a + + mova m1, [rsp+gprsize*2+16*23] ;t20 + mova m5, [rsp+gprsize*2+16*30] ;t27 + psubsw m2, m0, m1 ;t20a + paddsw m0, m1 ;t23a + psubsw m6, m3, m5 ;t27a + paddsw m3, m5 ;t24a + ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 + mova [rsp+gprsize*2+16*26], m0 ;t23a + mova [rsp+gprsize*2+16*27], m3 ;t24a + mova [rsp+gprsize*2+16*30], m2 ;t27 + + mova m0, [rsp+gprsize*2+16*20] ;t17a + mova m1, [rsp+gprsize*2+16*21] ;t18a + mova m2, [rsp+gprsize*2+16*32] ;t29a + mova m3, 
[rsp+gprsize*2+16*33] ;t30a + psubsw m4, m0, m1 ;t18 + paddsw m0, m1 ;t17 + psubsw m5, m3, m2 ;t29 + paddsw m3, m2 ;t30 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a + mova [rsp+gprsize*2+16*20], m0 ;t17 + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova [rsp+gprsize*2+16*33], m3 ;t30 + mova m0, [rsp+gprsize*2+16*19] ;t16 + mova m1, [rsp+gprsize*2+16*22] ;t19 + mova m2, [rsp+gprsize*2+16*31] ;t28 + mova m3, [rsp+gprsize*2+16*34] ;t31 + psubsw m4, m0, m1 ;t19a + paddsw m0, m1 ;t16a + psubsw m5, m3, m2 ;t28a + paddsw m3, m2 ;t31a + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 + mova m2, [rsp+gprsize*2+16*15] ;tmp12 + psubsw m1, m5, m6 ;t20a + paddsw m5, m6 ;t19a + psubsw m6, m2, m5 ;out19 + paddsw m2, m5 ;out12 + mova m5, [rsp+gprsize*2+16*30] ;t27 + mova [rsp+gprsize*2+16*22], m6 ;out19 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m6, m4, m5 ;t27a + paddsw m4, m5 ;t28a + ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 + mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 + psubsw m5, m2, m4 ;out28 + paddsw m2, m4 ;out3 + mova m4, [rsp+gprsize*2+16*14] ;tmp11 + mova [rsp+gprsize*2+16*31], m5 ;out28 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m5, m4, m6 ;out20 + paddsw m4, m6 ;out11 + mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 + mova [rsp+gprsize*2+16*23], m5 ;out20 + mova [rsp+gprsize*2+16*14], m4 ;out11 + psubsw m5, m2, m1 ;out27 + paddsw m2, m1 ;out4 + mova m1, [rsp+gprsize*2+16*26] ;t23a + mova m4, [rsp+gprsize*2+16*27] ;t24a + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*7 ], m2 ;out4 + psubsw m5, m0, m1 ;t23 + paddsw m0, m1 ;t16 + psubsw m2, m3, m4 ;t24 + paddsw m3, m4 ;t31 + ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a + mova m6, [rsp+gprsize*2+16*18] ;tmp15 + psubsw m4, m6, m0 ;out16 + paddsw m6, m0 ;out15 + mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 + mova m1, [rsp+gprsize*2+16*11] ;tmp8 + mova [rsp+gprsize*2+16*18], m6 ;out15 + mova [rsp+gprsize*2+16*19], m4 ;out16 + psubsw m6, m0, m3 ;out31 + paddsw m0, 
m3 ;out0 + psubsw m4, m1, m2 ;out23 + paddsw m1, m2 ;out8 + mova m3, [rsp+gprsize*2+16*10] ;tmp7 + mova [rsp+gprsize*2+16*34], m6 ;out31 + mova [rsp+gprsize*2+16*11], m1 ;out8 + mova [rsp+gprsize*2+16*26], m4 ;out23 + paddsw m6, m3, m5 ;out7 + psubsw m3, m5 ;out24 + mova m1, [rsp+gprsize*2+16*20] ;t17 + mova m5, [rsp+gprsize*2+16*25] ;t22 + mova m2, [rsp+gprsize*2+16*17] ;tmp14 + mova [rsp+gprsize*2+16*27], m3 ;out24 + psubsw m4, m1, m5 ;t22a + paddsw m1, m5 ;t17a + psubsw m3, m2, m1 ;out17 + paddsw m2, m1 ;out14 + mova m5, [rsp+gprsize*2+16*28] ;t25 + mova m1, [rsp+gprsize*2+16*33] ;t30 + mova [rsp+gprsize*2+16*17], m2 ;out14 + mova [rsp+gprsize*2+16*20], m3 ;out17 + psubsw m2, m1, m5 ;t25a + paddsw m1, m5 ;t30a + ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 + mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 + psubsw m3, m5, m1 ;out30 + paddsw m5, m1 ;out1 + mova m1, [rsp+gprsize*2+16*12] ;tmp9 + mova [rsp+gprsize*2+16*33], m3 ;out30 + mova [rsp+gprsize*2+16*4 ], m5 ;out1 + psubsw m3, m1, m2 ;out22 + paddsw m1, m2 ;out9 + mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 + mova [rsp+gprsize*2+16*25], m3 ;out22 + mova [rsp+gprsize*2+16*12], m1 ;out9 + psubsw m3, m5, m4 ;out25 + paddsw m5, m4 ;out6 + mova m4, [rsp+gprsize*2+16*21] ;t18a + mova m1, [rsp+gprsize*2+16*24] ;t21a + mova m2, [rsp+gprsize*2+16*16] ;tmp13 + mova [rsp+gprsize*2+16*28], m3 ;out25 + mova [rsp+gprsize*2+16*9 ], m5 ;out6 + paddsw m3, m4, m1 ;t18 + psubsw m4, m1 ;t21 + psubsw m5, m2, m3 ;out18 + paddsw m2, m3 ;out13 + mova m1, [rsp+gprsize*2+16*29] ;t26a + mova m3, [rsp+gprsize*2+16*32] ;t29a + mova [rsp+gprsize*2+16*21], m5 ;out18 + mova [rsp+gprsize*2+16*16], m2 ;out13 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t29 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a + mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 + psubsw m1, m2, m3 ;out29 + paddsw m2, m3 ;out2 + mova m3, [rsp+gprsize*2+16*13] ;tmp10 + mova [rsp+gprsize*2+16*32], m1 ;out29 + psubsw m7, m3, m5 ;out21 + paddsw m3, m5 ;out10 + mova m5, [rsp+gprsize*2+16*8 ] 
;tmp5 + mova [rsp+gprsize*2+16*24], m7 ;out21 + mova [rsp+gprsize*2+16*13], m3 ;out10 + psubsw m1, m5, m4 ;out26 + paddsw m5, m4 ;out5 + mova m7, m6 ;out7 + mova m3, [rsp+gprsize*2+16*6 ] ;out3 + mova m4, [rsp+gprsize*2+16*7 ] ;out4 + mova [rsp+gprsize*2+16*29], m1 ;out26 + mova m6, [rsp+gprsize*2+16*9 ] ;out6 + mova m1, [rsp+gprsize*2+16*4 ] ;out1 + ret + + +cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x8_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*1, 32 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + cmp eobd, 106 + jg .full + call m(idct_8x32_internal).main_fast + jmp .pass2 + +.full: + LOAD_8ROWS coeffq+16*17, 32 + 
mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal).main + +.pass2: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x8_internal).end)] + jmp m(idct_8x32_internal).end1 + +.end: + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal).end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.end1: + lea r3, [dstq+8] + lea tx2q, [o(m(idct_32x8_internal).end2)] + jmp m(idct_8x8_internal).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal).end3)] + jmp m(idct_8x8_internal).pass1_end1 + +.end3: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(m(idct_32x8_internal).end4)] + jmp m(idct_8x8_internal).pass2_main + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal).end5)] + jmp m(idct_8x8_internal).pass1_end1 + +.end5: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(m(idct_32x8_internal).end6)] + jmp m(idct_8x8_internal).pass2_main + +.end6: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal).end7)] + jmp m(idct_8x8_internal).pass1_end1 + +.end7: + mov dstq, r3 + lea tx2q, [o(m(idct_32x8_internal).end8)] + jmp m(idct_8x8_internal).pass2_main + +.end8: + ret + + +cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea tx2q, [o(m(idct_32x8_internal).end8)] +.loop: + LOAD_8ROWS coeffq+16*0, 64 + paddsw m6, [o(pw_5)] + mova [rsp+16*1], m6 + mova m6, [o(pw_5)] + REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + call 
m(idct_8x8_internal).pass1_end3 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + call m(idct_8x8_internal).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + add coeffq, 16 + dec r3d + jg .loop + RET + +cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + +.loop: + LOAD_8ROWS coeffq+16*0, 16 + pmulhrsw m6, [o(pw_4096)] + mova [rsp+16*1], m6 + mova m6, [o(pw_4096)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + lea tx2q, [o(m(idct_32x8_internal).end8)] + call m(idct_8x8_internal).pass1_end3 + + mov [rsp+16*3], dstq + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_8x8_internal).end4)] + call m(idct_8x8_internal).end3 + + add coeffq, 16*8 + mov dstq, [rsp+16*3] + lea dstq, [dstq+8] + dec r3d + jg .loop + jnc .loop + RET + + +cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_16x32_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)] + jmp m(inv_txfm_add_dct_dct_16x4).dconly + +.end: + RET + +cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + LOAD_8ROWS coeffq+16*1, 128, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*5, 128, 1 + call m(idct_16x8_internal).main + lea tx2q, [o(m(idct_16x32_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, 
[o(m(idct_16x32_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end1: + mova [coeffq+16*1 ], m0 ;in8 + mova [coeffq+16*5 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal).main + lea tx2q, [o(m(idct_16x32_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end3: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + + cmp eobd, 150 + jg .full + + mova m1, m4 ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [rsp+gprsize+16*11] ;in2 + mova m1, [rsp+gprsize+16*12] ;in6 + mova m2, [rsp+gprsize+16*13] ;in10 + mova m3, [rsp+gprsize+16*14] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal).main_fast + jmp .pass2 + +.full: + mova [coeffq+16*0 ], m0 ;in0 + mova [coeffq+16*4 ], m4 ;in4 + + LOAD_8ROWS coeffq+16*2, 128, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*6, 128, 1 + call m(idct_16x8_internal).main + lea tx2q, [o(m(idct_16x32_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end4: + SAVE_8ROWS 
coeffq+16*34, 64 ;in16~in23 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal).pass1_end5)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end5: + mova [coeffq+16*2 ], m0 ;in16 + mova [coeffq+16*6 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + + LOAD_8ROWS coeffq+16*3, 128, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*7, 128, 1 + call m(idct_16x8_internal).main + lea tx2q, [o(m(idct_16x32_internal).pass1_end6)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end6: + SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal).pass1_end7)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end7: + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + mova m6, m0 ;in24 + mova m7, m4 ;in28 + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*4 ] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*6 ] ;in20 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal).main + +.pass2: + mov [rsp+gprsize*1+16*35], eobd + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + lea r3, [o(m(idct_16x32_internal).end)] + jmp m(idct_8x32_internal).end + +.end: + mov dstq, [rsp+gprsize*2+16*35] + mov eobd, [rsp+gprsize*1+16*35] + add coeffq, 16*32 + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, 
[coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*16] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*17] ;in12 + + cmp eobd, 150 + jg .full1 + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal).main_fast + jmp .end1 + +.full1: + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*18] ;in20 + mova m6, [coeffq+16*3 ] ;in24 + mova m7, [coeffq+16*19] ;in26 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + mova m4, [coeffq+16*10] ;in18 + mova m5, [coeffq+16*26] ;in22 + mova m6, [coeffq+16*11] ;in26 + mova m7, [coeffq+16*27] ;in30 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 
;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal).main + +.end1: + jmp m(idct_8x32_internal).pass2 + + + +cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ;eob == 0 takes .dconly, otherwise idct_32x16_internal followed by four idct_8x16_internal .pass2 calls +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly ;eob == 0 + + call m(idct_32x16_internal) + call m(idct_8x16_internal).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] ;NOTE(review): r3 presumably still holds a dst pointer left by .pass2 - confirm + LOAD_8ROWS rsp+16*11, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal).end)] ;.end is a bare ret, presumably reached through tx2q so the call below returns here - confirm + call m(idct_8x8_internal).pass1_end + call m(idct_8x16_internal).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*19, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal).end)] + call m(idct_8x8_internal).pass1_end + call m(idct_8x16_internal).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*27, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal).end)] + call m(idct_8x8_internal).pass1_end + call m(idct_8x16_internal).pass2 + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd ;eob is zero on this path, so this stores 0 at [coeffq] + pmulhrsw m0, m1 ;second multiply by the pw_2896x8 value held in m1 + mov r3d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] + jmp m(inv_txfm_add_dct_dct_32x8).body ;continues in the 32x8 function's .body with r3d = 16 + + +cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ;runs .pass1 twice: first with coeffq advanced by 16 and r3 = .pass1_end1, then with coeffq restored and r3 = .end + %undef cmp + + add coeffq, 16 ;.pass1_end4 subtracts this again before re-entering .pass1 + lea r3, [o(m(idct_32x16_internal).pass1_end1)] ;copied into tx2q at .pass1_end +.pass1: + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*2, 64, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5
;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + LOAD_8ROWS coeffq+16*34, 64, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal).main + +.pass1_end: + mova [rsp+gprsize+16*0 ], m7 + mov tx2q, r3 ;r3 is .pass1_end1 on the first pass and .end on the second + jmp m(idct_8x8_internal).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+16*32, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*48, 32 + + sub coeffq, 16 ;undo the add done at function entry + lea r3, [o(m(idct_32x16_internal).end)] ;second pass hands .end to tx2q instead of .pass1_end1 + jmp .pass1 + +.end: + ret + + +cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, eobd + cmp eobd, 43 ;if (eob > 43) + sbb r3d, r3d ; iteration_count++ + cmp r4d, 150 ;if (eob > 150) + sbb r3d, 0 ; iteration_count++ + cmp r4d, 278 ;if (eob > 278) + sbb r3d, -4 ; iteration_count++ (net: r3d = 4 minus one for each of eob below 43, below 150, below 278. NOTE(review): each cmp/sbb pair counts eob at or above N while the comments say above N - confirm intent) + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 ;dstq advanced by 8, reloaded into dstq after the first inner loop + mov [rsp+gprsize+16*3], r3d ;saved iteration count + mov [rsp+gprsize*2+16*3], coeffq ;saved coeff base + +.loop: + LOAD_8ROWS coeffq, 64, 1 + mova [rsp+16*1], m6 + pxor m6, m6 + REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 ;writes the zeroed m6 to coeffq at 64-byte steps 0 to 7 + lea tx2q, [o(m(idct_32x16_internal).end)] + call m(idct_8x8_internal).pass1_end3 + mova [rsp+16*0], m2 + mova [rsp+16*1], m3 +
mova [rsp+16*2], m4 + mova m3, [o(pw_1697x16)] + mova m4, [o(pw_16384)] + REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 + mova m2, [o(pw_8192)] + REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 + mova m2, [rsp+16*0] + mova [rsp+16*0], m7 + IDTX16 2, 7, 3, 4 + mova m7, [rsp+16*2] + mova [rsp+16*2], m5 + IDTX16 7, 5, 3, 4 + mova m5, [rsp+16*1] + mova [rsp+16*1], m6 + pmulhrsw m3, m5 + pmulhrsw m3, m4 + psrlw m4, 1 ; pw_8192 + paddsw m3, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pmulhrsw m4, m7 + call m(idct_8x8_internal).end3 + lea dstq, [dstq+strideq*2] + add coeffq, 16 + dec r3d + jg .loop + mov coeffq, [rsp+gprsize*2+16*3] ;reload the coeff base saved at entry + add coeffq, 64*8 + mov r3d, [rsp+gprsize+16*3] ;reload the saved iteration count + xor dstq, dstq + mov [rsp+gprsize+16*3], dstq ;zero the saved count so the next time through falls out to RET + mov dstq, [rsp+16*3] ;dst pointer advanced by 8, saved at entry + test r3d, r3d + jnz .loop ;taken once with the original count, the reloaded count is 0 afterwards + RET + + +cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 12 ;1100b + mov r5d, 136 ;1000 1000b + cmp eobd, 44 ;if (eob > 43) + cmovns r4d, r5d ; iteration_count+2 + cmp eobd, 151 ;if (eob > 150) + mov r3d, 34952 ;1000 1000 1000 1000b + cmovs r3d, r4d ; iteration_count += 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 ;dst pointer advanced by 8, used when .loop_end switches destination + +.loop: + LOAD_8ROWS coeffq, 32, 1 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal).end)] + call m(idct_8x8_internal).pass1_end3 + mova [rsp+16*1], m5 + mova [rsp+16*2], m6 + mova m6, [o(pw_1697x16)] + REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 + pmulhrsw m7, [o(pw_2048)] + mova m5, [rsp+16*1] + mova [rsp+16*0], m7 + IDTX16 5, 7, 6 + mova m7, [rsp+16*2] + IDTX16 7, 6, 6 + mova m6, [o(pw_2048)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m7 + call m(idct_8x8_internal).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + +.loop_end: + add coeffq, 16 + shr r3d, 2 ;r3d is consumed two bits per iteration: 12 gives 2 iterations, 136 gives 4, 34952 gives 8 + jz .ret + test r3d, 2 + jnz .loop ;bit 1 set: keep going with the current dstq, otherwise fall through and reload dstq from the stack + mov r4d, r3d + and r4d, 1 + lea coeffq,
[coeffq+r4*8+32*7] + mov dstq, [rsp+16*3] ;switch to the saved dst pointer + lea r4, [dstq+8] + mov [rsp+16*3], r4 ;save the pointer advanced by 8 for the next switch + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ;eob == 0 takes .dconly, otherwise the whole transform is done by idct_32x32_internal +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly ;eob == 0 + + call m(idct_32x32_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd ;eob is zero on this path, so this stores 0 at [coeffq] + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] + jmp m(inv_txfm_add_dct_dct_32x8).body ;continues in the 32x8 function's .body with r3d = 32 + + +cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ;pass 1 runs .pass1_loop r3d times, pass 2 runs .pass2_loop four times + %undef cmp + + mov r4d, 2 + sub eobd, 136 ;the flags set here survive the two movs below and drive cmovs + mov [rsp+gprsize*1+16*35], eobd ;saved eob minus 136, its sign selects .fast and .fast1 later + mov r3d, 4 + cmovs r3d, r4d ;r3d = 2 iterations when eob is below 136, else 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*35], coeffq ;coeff base, reloaded at .pass2 + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*35] + test tx2d, tx2d + jl .fast ;saved eob minus 136 is negative + +.full: + LOAD_8ROWS coeffq+64*0, 64*4 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal).main + jmp .pass1_end + +.fast: + mova m0, [coeffq+256*0] + mova m1, [coeffq+256*1] + mova m2, [coeffq+256*2] + mova m3, [coeffq+256*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + +
SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+128*1] + mova m1, [coeffq+128*3] + mova m2, [coeffq+128*5] + mova m3, [coeffq+128*7] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop ;r3d was set to 2 or 4 at function entry + + +.pass2: + mov coeffq, [rsp+gprsize*2+16*35] ;reload the coeff base saved before .pass1_loop + mov r3d, 4 ;four .pass2_loop iterations + lea tx2q, [o(m(idct_32x32_internal).pass2_end)] ;.full1 and .fast1 both end with jmp tx2q + +.pass2_loop: + mov [rsp+gprsize*3+16*35], r3d ;save the loop counter, reloaded at .pass2_end1 + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 ;save dstq advanced by 8, reloaded into dstq at .pass2_end1 + + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*12] + mova m2, [coeffq+16*20] + mova m3, [coeffq+16*28] + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*13] + mova m6, [coeffq+16*21] + mova m7, [coeffq+16*29] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov eobd, [rsp+gprsize*1+16*35] + test
eobd, eobd + jl .fast1 ;the saved eob minus 136 is negative + +.full1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + mova m4, [coeffq+16*2 ] + mova m5, [coeffq+16*18] + mova m6, [coeffq+16*3 ] + mova m7, [coeffq+16*19] + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*22] + mova m3, [coeffq+16*30] + mova m4, [coeffq+16*7 ] + mova m5, [coeffq+16*15] + mova m6, [coeffq+16*23] + mova m7, [coeffq+16*31] + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal).main + jmp tx2q ;tx2q was loaded with .pass2_end at .pass2 and at .pass2_end1 + +.fast1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal).main_fast + jmp tx2q + +.pass2_end: + lea r3, [o(m(idct_32x32_internal).pass2_end1)] ;NOTE(review): presumably the address idct_8x32_internal .end continues at - confirm there + jmp m(idct_8x32_internal).end + +.pass2_end1: + lea tx2q, [o(m(idct_32x32_internal).pass2_end)] ;set tx2q to .pass2_end again for the next iteration's jmp tx2q + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] ;dst pointer advanced by 8, saved at the top of .pass2_loop + mov r3d, [rsp+gprsize*3+16*35] ;loop counter saved at the top of .pass2_loop + dec r3d + jg .pass2_loop + + ret + + +cglobal
inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + cmp eobd, 136 + mov r3d, 4 + cmovs r3d, r4d ;r3d = 2 when eob is below 136, else 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + lea r4, [dstq+8] + mov [rsp+gprsize*0+16*3], r4 ;dst pointer for the next outer pass + mov [rsp+gprsize*1+16*3], r3d ;inner-loop count, reloaded into r3d for every outer pass + mov [rsp+gprsize*2+16*3], r3d ;outer-loop counter, decremented after each inner loop + mov [rsp+gprsize*3+16*3], coeffq ;coeff base of the current outer pass + +.loop: + LOAD_8ROWS coeffq, 64 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal).end)] + call m(idct_8x8_internal).pass1_end3 + pmulhrsw m7, [o(pw_8192)] + mova [rsp+16*0], m7 + mova m7, [o(pw_8192)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+16*1], m6 + mova [rsp+16*2], m5 + call m(idct_8x8_internal).end3 + lea dstq, [dstq+strideq*2] + + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + + add coeffq, 16 + dec r3d + jg .loop + + mov r4d, [rsp+gprsize*2+16*3] + dec r4d + jle .ret ;outer counter exhausted + + mov dstq, [rsp+gprsize*0+16*3] + mov coeffq, [rsp+gprsize*3+16*3] + mov [rsp+gprsize*2+16*3], r4 ;store the decremented outer counter + lea r3, [dstq+8] + add coeffq, 64*8 + mov [rsp+gprsize*0+16*3], r3 + mov r3d, [rsp+gprsize*1+16*3] ;restart the inner loop with the count saved at entry + mov [rsp+gprsize*3+16*3], coeffq + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 ;eob == 0 takes .dconly, otherwise the whole transform is done by idct_16x64_internal +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly ;eob == 0 + + call m(idct_16x64_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd ;eob is zero on this path, so this stores 0 at [coeffq] + mov r2d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)] ;the local .end below is a bare RET + jmp m(inv_txfm_add_dct_dct_16x4).dconly + +.end: + RET + + +cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 151 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*0, 64*2 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*1, 64*2 + call
m(idct_16x8_internal).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_16x64_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_16x64_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 2 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal).end1)] + +.pass2_loop: + mov [rsp+gprsize*3+16*67], r3d + mov eobd, [rsp+gprsize*1+16*67] + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + pxor m4, m4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + + test eobd, eobd + jl .fast + +.full: + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + mova m2, [coeffq+16*18] + mova m3, [coeffq+16*19] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + mova [rsp+gprsize+16*19], m0 + mova 
[rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call .main + jmp .end + +.fast: + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + + call m(idct_8x32_internal).main_veryfast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + call .main_fast + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, r4 + jmp m(idct_8x32_internal).end2 + +.end1: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + add rsp, 16*32 + lea r3, [o(m(idct_16x64_internal).end2)] + jmp m(idct_8x32_internal).end + +.end2: + add coeffq, 16*32 + sub rsp, 16*32 + + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea 
r4, [o(m(idct_16x64_internal).end1)] + + dec r3d + jg .pass2_loop + ret + + +ALIGN function_align +.main_fast: + mova m0, [rsp+gprsize*2+16*35] ;in1 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 + pmulhrsw m0, [o(pw_101x8)] ;t32,t33 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*66], m3 ;t63 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*36], m3 ;t33a + mova [rsp+gprsize*2+16*65], m0 ;t62a + + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 + pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 + mova [rsp+gprsize*2+16*38], m1 ;t35 + mova [rsp+gprsize*2+16*63], m2 ;t60 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m2 ;t34a + mova [rsp+gprsize*2+16*64], m1 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 + pmulhrsw m0, [o(pw_897x8)] ;t36,t37 + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*62], m3 ;t59 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*40], m3 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 + pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 + mova [rsp+gprsize*2+16*42], m1 ;t39 + mova [rsp+gprsize*2+16*59], m2 ;t56 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m2 ;t38a + mova [rsp+gprsize*2+16*60], m1 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 + pmulhrsw m0, [o(pw_501x8)] ;t40,t41 + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*58], m3 ;t55 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*44], m3 ;t41a + mova [rsp+gprsize*2+16*57], m0 ;t54a + + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 + pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 + mova [rsp+gprsize*2+16*46], m1 ;t43 + mova [rsp+gprsize*2+16*55], m2 ;t52 + ITX_MULSUB_2W 
2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m2 ;t42a + mova [rsp+gprsize*2+16*56], m1 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 + pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 + mova m6, m0 + mova [rsp+gprsize*2+16*54], m3 ;t51 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m3 ;t45a + mova [rsp+gprsize*2+16*53], m0 ;t50a + + mova m0, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 + pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 + mova m4, m3 + mova m5, m0 + + jmp .main2 + +ALIGN function_align +.main: + mova m0, [rsp+gprsize*2+16*35] ;in1 + mova m1, [rsp+gprsize*2+16*65] ;in31 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a + pmulhrsw m0, [o(pw_101x8)] ;t32a + pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a + pmulhrsw m1, [o(pw_m2824x8)] ;t33a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t33 + paddsw m0, m1 ;t32 + psubsw m5, m3, m2 ;t62 + paddsw m3, m2 ;t63 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*36], m5 ;t33a + mova [rsp+gprsize*2+16*65], m4 ;t62a + mova [rsp+gprsize*2+16*66], m3 ;t63 + + mova m0, [rsp+gprsize*2+16*63] ;in17 + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a + pmulhrsw m0, [o(pw_1660x8)] ;t34a + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a + pmulhrsw m1, [o(pw_m1474x8)] ;t35a + psubsw m4, m1, m0 ;t34 + paddsw m0, m1 ;t35 + psubsw m5, m2, m3 ;t61 + paddsw m3, m2 ;t60 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m5 ;t34a + mova [rsp+gprsize*2+16*38], m0 ;t35 + mova [rsp+gprsize*2+16*63], m3 ;t60 + mova [rsp+gprsize*2+16*64], m4 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + mova m1, [rsp+gprsize*2+16*61] ;in23 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a + pmulhrsw m0, [o(pw_897x8)] ;t36a + pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a + pmulhrsw m1, [o(pw_m2191x8)] ;t37a + psubsw m4, m0, m1 ;t37 + paddsw m0, m1 ;t36 + psubsw m5, m3, 
m2 ;t58 + paddsw m3, m2 ;t59 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*62], m3 ;t59 + + mova m0, [rsp+gprsize*2+16*59] ;in25 + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a + pmulhrsw m0, [o(pw_2359x8)] ;t38a + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a + pmulhrsw m1, [o(pw_m700x8)] ;t39a + psubsw m4, m1, m0 ;t38 + paddsw m0, m1 ;t39 + psubsw m5, m2, m3 ;t57 + paddsw m3, m2 ;t56 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m5 ;t38a + mova [rsp+gprsize*2+16*42], m0 ;t39 + mova [rsp+gprsize*2+16*59], m3 ;t56 + mova [rsp+gprsize*2+16*60], m4 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + mova m1, [rsp+gprsize*2+16*57] ;in27 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a + pmulhrsw m0, [o(pw_501x8)] ;t40a + pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a + pmulhrsw m1, [o(pw_m2520x8)] ;t41a + psubsw m4, m0, m1 ;t41 + paddsw m0, m1 ;t40 + psubsw m5, m3, m2 ;t54 + paddsw m3, m2 ;t55 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*44], m5 ;t41a + mova [rsp+gprsize*2+16*57], m4 ;t54a + mova [rsp+gprsize*2+16*58], m3 ;t55 + + mova m0, [rsp+gprsize*2+16*55] ;in21 + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a + pmulhrsw m0, [o(pw_2019x8)] ;t42a + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a + pmulhrsw m1, [o(pw_m1092x8)] ;t43a + psubsw m4, m1, m0 ;t42 + paddsw m0, m1 ;t43 + psubsw m5, m2, m3 ;t53 + paddsw m3, m2 ;t52 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*46], m0 ;t43 + mova [rsp+gprsize*2+16*55], m3 ;t52 + mova [rsp+gprsize*2+16*56], m4 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + mova m1, [rsp+gprsize*2+16*53] ;in19 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a + pmulhrsw m0, [o(pw_1285x8)] ;t44a + 
pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a + pmulhrsw m1, [o(pw_m1842x8)] ;t45a + psubsw m4, m0, m1 ;t45 + paddsw m0, m1 ;t44 + psubsw m5, m3, m2 ;t50 + paddsw m3, m2 ;t51 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova m6, m0 + mova [rsp+gprsize*2+16*48], m5 ;t45a + mova [rsp+gprsize*2+16*53], m4 ;t50a + mova [rsp+gprsize*2+16*54], m3 ;t51 + + mova m0, [rsp+gprsize*2+16*51] ;in29 + mova m1, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a + pmulhrsw m0, [o(pw_2675x8)] ;t46a + pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a + pmulhrsw m1, [o(pw_m301x8)] ;t47a + psubsw m5, m1, m0 ;t46 + paddsw m0, m1 ;t47 + psubsw m4, m2, m3 ;t49 + paddsw m3, m2 ;t48 + +ALIGN function_align +.main2: + ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m2, m0, m6 ;t44a + paddsw m0, m6 ;t47a + psubsw m6, m3, m1 ;t51a + paddsw m3, m1 ;t48a + mova [rsp+gprsize*2+16*50], m0 ;t47a + mova [rsp+gprsize*2+16*51], m3 ;t48a + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 + mova [rsp+gprsize*2+16*47], m6 ;t44 + mova [rsp+gprsize*2+16*54], m2 ;t51 + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m3, [rsp+gprsize*2+16*53] ;t50a + psubsw m2, m4, m0 ;t45 + paddsw m4, m0 ;t46 + psubsw m6, m5, m3 ;t50 + paddsw m5, m3 ;t49 + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m6 ;t45a + mova [rsp+gprsize*2+16*49], m4 ;t46 + mova [rsp+gprsize*2+16*52], m5 ;t49 + mova [rsp+gprsize*2+16*53], m2 ;t50a + + mova m0, [rsp+gprsize*2+16*43] ;t40 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*58] ;t55 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t40a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t55a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 + mova [rsp+gprsize*2+16*43], m0 ;t40a + mova [rsp+gprsize*2+16*46], m5 ;t43 + mova [rsp+gprsize*2+16*55], m4 ;t52 + mova [rsp+gprsize*2+16*58], m1 ;t55a + + mova m0, [rsp+gprsize*2+16*44] ;t41a + mova m2, 
[rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*57] ;t54a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t41 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t54 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a + mova [rsp+gprsize*2+16*44], m0 ;t41 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*57], m1 ;t54 + + mova m0, [rsp+gprsize*2+16*41] ;t38a + mova m2, [rsp+gprsize*2+16*40] ;t37a + mova m3, [rsp+gprsize*2+16*61] ;t58a + mova m1, [rsp+gprsize*2+16*60] ;t57a + psubsw m4, m0, m2 ;t37 + paddsw m0, m2 ;t38 + psubsw m5, m1, m3 ;t58 + paddsw m1, m3 ;t57 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a + mova [rsp+gprsize*2+16*41], m0 ;t38 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*60], m1 ;t57 + + mova m0, [rsp+gprsize*2+16*42] ;t39 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*59] ;t56 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t39a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t56a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 + mova [rsp+gprsize*2+16*42], m0 ;t39a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*59], m1 ;t56a + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m2, [rsp+gprsize*2+16*38] ;t35 + mova m3, [rsp+gprsize*2+16*63] ;t60 + mova m1, [rsp+gprsize*2+16*66] ;t63 + psubsw m4, m0, m2 ;t35a + paddsw m0, m2 ;t32a + psubsw m5, m1, m3 ;t60a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 + mova [rsp+gprsize*2+16*35], m0 ;t32a + mova [rsp+gprsize*2+16*38], m5 ;t35 + mova [rsp+gprsize*2+16*63], m4 ;t60 + mova [rsp+gprsize*2+16*66], m1 ;t63a + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m2, [rsp+gprsize*2+16*37] ;t34a + mova m3, [rsp+gprsize*2+16*64] ;t61a + mova m1, [rsp+gprsize*2+16*65] ;t62a + psubsw m4, m0, m2 ;t34 + paddsw m0, m2 ;t33 + psubsw m5, m1, m3 
;t61 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a + + mova m2, [rsp+gprsize*2+16*41] ;t38 + mova m3, [rsp+gprsize*2+16*60] ;t57 + psubsw m6, m0, m2 ;t38a + paddsw m0, m2 ;t33a + psubsw m2, m1, m3 ;t57a + paddsw m1, m3 ;t62a + mova [rsp+gprsize*2+16*36], m0 ;t33a + mova [rsp+gprsize*2+16*65], m1 ;t62a + ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 + mova [rsp+gprsize*2+16*41], m2 ;t38 + mova [rsp+gprsize*2+16*60], m6 ;t57 + + mova m2, [rsp+gprsize*2+16*40] ;t37 + mova m3, [rsp+gprsize*2+16*61] ;t58 + psubsw m0, m5, m2 ;t37 + paddsw m5, m2 ;t34 + psubsw m1, m4, m3 ;t58 + paddsw m4, m3 ;t61 + ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a + mova [rsp+gprsize*2+16*37], m5 ;t34 + mova [rsp+gprsize*2+16*64], m4 ;t61 + mova [rsp+gprsize*2+16*40], m1 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m0, [rsp+gprsize*2+16*38] ;t35 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*63] ;t60 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t35a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t60a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 + mova [rsp+gprsize*2+16*38], m0 ;t35a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*63], m1 ;t60a + + mova m0, [rsp+gprsize*2+16*35] ;t32a + mova m2, [rsp+gprsize*2+16*42] ;t39a + mova m3, [rsp+gprsize*2+16*59] ;t56a + mova m1, [rsp+gprsize*2+16*66] ;t63a + psubsw m4, m0, m2 ;t39 + paddsw m0, m2 ;t32 + psubsw m5, m1, m3 ;t56 + paddsw m1, m3 ;t63 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*42], m5 ;t39a + mova [rsp+gprsize*2+16*59], m4 ;t56a + mova [rsp+gprsize*2+16*66], m1 ;t63 + + mova m0, [rsp+gprsize*2+16*50] ;t47a + mova m2, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*51] ;t48a + psubsw m4, m0, m2 ;t40 + paddsw m0, m2 ;t47 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 
;t48 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a + mova [rsp+gprsize*2+16*50], m0 ;t47 + mova [rsp+gprsize*2+16*43], m5 ;t40a + mova [rsp+gprsize*2+16*58], m4 ;t55a + mova [rsp+gprsize*2+16*51], m1 ;t48 + + mova m0, [rsp+gprsize*2+16*49] ;t46 + mova m2, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*52] ;t49 + psubsw m4, m0, m2 ;t41a + paddsw m0, m2 ;t46a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t49a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 + mova [rsp+gprsize*2+16*49], m0 ;t46a + mova [rsp+gprsize*2+16*44], m5 ;t41 + mova [rsp+gprsize*2+16*57], m4 ;t54 + mova [rsp+gprsize*2+16*52], m1 ;t49a + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*53] ;t50a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t45 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t50 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a + mova [rsp+gprsize*2+16*48], m0 ;t45 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*53], m1 ;t50 + + mova m0, [rsp+gprsize*2+16*47] ;t44 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t44a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t51a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 + + mova m2, [rsp+gprsize*2+16*38] ;t35a + mova m3, [rsp+gprsize*2+16*31] ;tmp[28] + psubsw m6, m2, m0 ;t44 + paddsw m2, m0 ;t35 + psubsw m0, m3, m2 ;out35 + paddsw m2, m3 ;out28 + mova m3, [rsp+gprsize*2+16*63] ;t60a + mova [rsp+gprsize*2+16*38], m0 ;out35 + mova [rsp+gprsize*2+16*31], m2 ;out28 + psubsw m0, m3, m1 ;t51 + paddsw m3, m1 ;t60 + ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a + mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] + psubsw m1, m2, m3 ;out60 + paddsw m2, m3 ;out3 + mova m3, [rsp+gprsize*2+16*22] ;tmp[19] + mova [rsp+gprsize*2+16*63], m1 
;out60 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m1, m3, m0 ;out44 + paddsw m3, m0 ;out19 + mova m2, [rsp+gprsize*2+16*15] ;tmp[12] + + mova m0, [rsp+gprsize*2+16*39] ;t36 + mova [rsp+gprsize*2+16*47], m1 ;out44 + mova [rsp+gprsize*2+16*22], m3 ;out19 + mova m1, [rsp+gprsize*2+16*62] ;t59 + psubsw m3, m2, m6 ;out51 + paddsw m2, m6 ;out12 + mova [rsp+gprsize*2+16*54], m3 ;out51 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m2, m0, m5 ;t43a + paddsw m0, m5 ;t36a + mova m5, [rsp+gprsize*2+16*30] ;tmp[27] + psubsw m3, m1, m4 ;t52a + paddsw m1, m4 ;t59a + ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 + mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] + psubsw m6, m5, m0 ;out36 + paddsw m5, m0 ;out27 + psubsw m0, m4, m1 ;out59 + paddsw m4, m1 ;out4 + mova [rsp+gprsize*2+16*39], m6 ;out36 + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*62], m0 ;out59 + mova [rsp+gprsize*2+16*7 ], m4 ;out4 + mova m0, [rsp+gprsize*2+16*23] ;tmp[20] + mova m5, [rsp+gprsize*2+16*14] ;tmp[11] + psubsw m4, m0, m3 ;out43 + paddsw m0, m3 ;out20 + psubsw m6, m5, m2 ;out52 + paddsw m5, m2 ;out11 + mova [rsp+gprsize*2+16*46], m4 ;out43 + mova [rsp+gprsize*2+16*23], m0 ;out20 + mova [rsp+gprsize*2+16*55], m6 ;out52 + mova [rsp+gprsize*2+16*14], m5 ;out11 + + mova m0, [rsp+gprsize*2+16*40] ;t37a + mova m5, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*61] ;t58a + mova m2, [rsp+gprsize*2+16*29] ;tmp[26] + psubsw m4, m0, m5 ;t42 + paddsw m0, m5 ;t37 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t58 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 + mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] + psubsw m6, m2, m0 ;out37 + paddsw m2, m0 ;out26 + psubsw m0, m3, m1 ;out58 + paddsw m3, m1 ;out5 + mova [rsp+gprsize*2+16*40], m6 ;out37 + mova [rsp+gprsize*2+16*29], m2 ;out26 + mova [rsp+gprsize*2+16*61], m0 ;out58 + mova [rsp+gprsize*2+16*8 ], m3 ;out5 + mova m0, [rsp+gprsize*2+16*24] ;tmp[21] + mova m1, [rsp+gprsize*2+16*13] ;tmp[10] + psubsw 
m2, m0, m5 ;out42 + paddsw m0, m5 ;out21 + psubsw m3, m1, m4 ;out53 + paddsw m1, m4 ;out10 + mova [rsp+gprsize*2+16*45], m2 ;out42 + mova [rsp+gprsize*2+16*24], m0 ;out21 + mova [rsp+gprsize*2+16*56], m3 ;out53 + mova [rsp+gprsize*2+16*13], m1 ;out10 + + mova m0, [rsp+gprsize*2+16*41] ;t38 + mova m5, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*60] ;t57 + mova m2, [rsp+gprsize*2+16*28] ;tmp[25] + psubsw m4, m0, m5 ;t41a + paddsw m0, m5 ;t38a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t57a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a + mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] + psubsw m6, m2, m0 ;out38 + paddsw m2, m0 ;out25 + psubsw m0, m3, m1 ;out57 + paddsw m3, m1 ;out6 + mova [rsp+gprsize*2+16*41], m6 ;out38 + mova [rsp+gprsize*2+16*28], m2 ;out25 + mova [rsp+gprsize*2+16*60], m0 ;out57 + mova [rsp+gprsize*2+16*9 ], m3 ;out6 + mova m0, [rsp+gprsize*2+16*25] ;tmp[22] + mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] + psubsw m2, m0, m5 ;out41 + paddsw m0, m5 ;out22 + psubsw m3, m1, m4 ;out54 + paddsw m1, m4 ;out9 + mova [rsp+gprsize*2+16*44], m2 ;out41 + mova [rsp+gprsize*2+16*25], m0 ;out22 + mova [rsp+gprsize*2+16*57], m3 ;out54 + mova [rsp+gprsize*2+16*12], m1 ;out9 + + mova m0, [rsp+gprsize*2+16*42] ;t39a + mova m5, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*59] ;t56a + mova m2, [rsp+gprsize*2+16*27] ;tmp[24] + psubsw m4, m0, m5 ;t40 + paddsw m0, m5 ;t39 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t56 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a + mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] + psubsw m6, m2, m0 ;out39 + paddsw m2, m0 ;out24 + psubsw m0, m3, m1 ;out56 + paddsw m3, m1 ;out7 + mova [rsp+gprsize*2+16*42], m6 ;out39 + mova [rsp+gprsize*2+16*27], m2 ;out24 + mova [rsp+gprsize*2+16*59], m0 ;out56 + mova [rsp+gprsize*2+16*10], m3 ;out7 + mova m0, [rsp+gprsize*2+16*26] ;tmp[23] + mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] + psubsw m2, m0, m5 ;out40 + 
paddsw m0, m5 ;out23 + psubsw m3, m1, m4 ;out55 + paddsw m1, m4 ;out8 + mova [rsp+gprsize*2+16*43], m2 ;out40 + mova [rsp+gprsize*2+16*26], m0 ;out23 + mova [rsp+gprsize*2+16*58], m3 ;out55 + mova [rsp+gprsize*2+16*11], m1 ;out8 + + mova m0, [rsp+gprsize*2+16*37] ;t34 + mova m5, [rsp+gprsize*2+16*48] ;t45 + mova m3, [rsp+gprsize*2+16*53] ;t50 + mova m1, [rsp+gprsize*2+16*64] ;t61 + mova m2, [rsp+gprsize*2+16*32] ;tmp[29] + psubsw m4, m0, m5 ;t45a + paddsw m0, m5 ;t34a + psubsw m5, m1, m3 ;t50a + paddsw m1, m3 ;t61a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] + psubsw m6, m2, m0 ;out34 + paddsw m2, m0 ;out29 + psubsw m0, m3, m1 ;out61 + paddsw m3, m1 ;out2 + mova [rsp+gprsize*2+16*37], m6 ;out34 + mova [rsp+gprsize*2+16*32], m2 ;out29 + mova [rsp+gprsize*2+16*64], m0 ;out61 + mova [rsp+gprsize*2+16*5 ], m3 ;out2 + mova m0, [rsp+gprsize*2+16*21] ;tmp[18] + mova m1, [rsp+gprsize*2+16*16] ;tmp[13] + psubsw m2, m0, m5 ;out45 + paddsw m0, m5 ;out18 + psubsw m3, m1, m4 ;out50 + paddsw m1, m4 ;out13 + mova [rsp+gprsize*2+16*48], m2 ;out45 + mova [rsp+gprsize*2+16*21], m0 ;out18 + mova [rsp+gprsize*2+16*53], m3 ;out50 + mova [rsp+gprsize*2+16*16], m1 ;out13 + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m5, [rsp+gprsize*2+16*49] ;t46a + mova m3, [rsp+gprsize*2+16*52] ;t49a + mova m1, [rsp+gprsize*2+16*65] ;t62a + mova m2, [rsp+gprsize*2+16*33] ;tmp[30] + psubsw m4, m0, m5 ;t46 + paddsw m0, m5 ;t33 + psubsw m5, m1, m3 ;t49 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] + psubsw m6, m2, m0 ;out33 + paddsw m2, m0 ;out30 + psubsw m0, m3, m1 ;out62 + paddsw m3, m1 ;out1 + mova [rsp+gprsize*2+16*36], m6 ;out33 + mova [rsp+gprsize*2+16*33], m2 ;out30 + mova [rsp+gprsize*2+16*65], m0 ;out62 + mova [rsp+gprsize*2+16*4 ], m3 ;out1 + mova m0, [rsp+gprsize*2+16*20] ;tmp[17] + mova m1, [rsp+gprsize*2+16*17] ;tmp[14] + psubsw m2, m0, m5 ;out46 + paddsw m0, m5 ;out17 + 
psubsw m3, m1, m4 ;out49 + paddsw m1, m4 ;out14 + mova [rsp+gprsize*2+16*49], m2 ;out46 + mova [rsp+gprsize*2+16*20], m0 ;out17 + mova [rsp+gprsize*2+16*52], m3 ;out49 + mova [rsp+gprsize*2+16*17], m1 ;out14 + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m5, [rsp+gprsize*2+16*50] ;t47 + mova m3, [rsp+gprsize*2+16*51] ;t48 + mova m1, [rsp+gprsize*2+16*66] ;t63 + mova m2, [rsp+gprsize*2+16*34] ;tmp[31] + psubsw m4, m0, m5 ;t47a + paddsw m0, m5 ;t32a + psubsw m5, m1, m3 ;t48a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 + mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] + psubsw m6, m2, m0 ;out32 + paddsw m2, m0 ;out31 + psubsw m0, m3, m1 ;out63 + paddsw m3, m1 ;out0 + mova [rsp+gprsize*2+16*35], m6 ;out32 + mova [rsp+gprsize*2+16*34], m2 ;out31 + mova [rsp+gprsize*2+16*66], m0 ;out63 + mova [rsp+gprsize*2+16*3 ], m3 ;out0 + mova m0, [rsp+gprsize*2+16*19] ;tmp[16] + mova m1, [rsp+gprsize*2+16*18] ;tmp[15] + psubsw m2, m0, m5 ;out47 + paddsw m0, m5 ;out16 + psubsw m3, m1, m4 ;out48 + paddsw m1, m4 ;out15 + mova [rsp+gprsize*2+16*50], m2 ;out47 + mova [rsp+gprsize*2+16*19], m0 ;out16 + mova [rsp+gprsize*2+16*51], m3 ;out48 + mova [rsp+gprsize*2+16*18], m1 ;out15 + ret + + +cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x16_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m7, m7 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + mova m5, [dstq+16*2] + mova m6, [dstq+16*3] + punpckhbw m2, m1, m7 + punpcklbw m1, m7 + punpckhbw m4, m3, m7 + punpcklbw m3, m7 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + 
packuswb m1, m2 + packuswb m3, m4 + punpckhbw m2, m5, m7 + punpcklbw m5, m7 + punpckhbw m4, m6, m7 + punpcklbw m6, m7 + paddw m2, m0 + paddw m5, m0 + paddw m4, m0 + paddw m6, m0 + packuswb m5, m2 + packuswb m6, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + mova [dstq+16*2], m5 + mova [dstq+16*3], m6 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 + +%if %3 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [%1+%2*0] + pmulhrsw m1, m3, [%1+%2*1] + pmulhrsw m2, m3, [%1+%2*2] + pmulhrsw m3, [%1+%2*3] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] +%endif +%endmacro + +%macro LOAD_4ROWS_H 2 ;src, stride + mova m4, [%1+%2*0] + mova m5, [%1+%2*1] + mova m6, [%1+%2*2] + mova m7, [%1+%2*3] +%endmacro + +cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r3d, 2 + mov [rsp+gprsize*2+16*67], dstq + lea dstq, [rsp+gprsize+16*68] + +.pass1_loop: + LOAD_4ROWS coeffq+32*0, 32*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+32*4, 32*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+32*2, 32*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+32*1, 32*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS 
coeffq+32*17, 32*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+32*24, 32 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end5)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal).pass1_end6)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + 
lea tx2q, [o(m(idct_64x16_internal).pass1_end7)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+32*24, 32 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*2+16*67] + sub coeffq, 32 + mov r3d, 4 + +.pass2_loop: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal).main + + mov r3, dstq + lea tx2q, [o(m(idct_64x16_internal).end)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x16_internal).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop + + mov r3d, 4 + lea coeffq, [rsp+gprsize+16*68] +.pass2_loop2: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal).main + + mov r3, dstq + lea tx2q, [o(m(idct_64x16_internal).end2)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal).end + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x16_internal).end3)] + mov dstq, r3 + jmp m(idct_8x8_internal).end + +.end3: + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop2 + ret + + +cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if 
ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x64_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)] + jmp m(inv_txfm_add_dct_dct_32x8).body + +.end: + RET + + +cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*67] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4, 1 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4, 1 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal).main + jmp .pass1_end + +.fast: + LOAD_4ROWS coeffq, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+128*1, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call 
m(idct_8x32_internal).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal).end1)] + jmp m(idct_16x64_internal).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x32_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + pmulhrsw m0, m1 + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] + jmp m(inv_txfm_add_dct_dct_64x16).body + +.end: + RET + +cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*4+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS 
rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8, 1 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4, 1 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end3)] + jmp 
m(idct_8x8_internal).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end5)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end6)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal).pass1_end7)] + jmp m(idct_8x8_internal).pass1_end + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*3+16*67] + mov eobd, [rsp+gprsize*1+16*67] + lea dstq, [dstq+32] + mov [rsp+gprsize*1+16*35], eobd + lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal).pass2_loop + +.pass2_end: + mova [rsp+gprsize+16*0], m7 + lea r3, [o(m(idct_64x32_internal).pass2_end1)] + jmp m(idct_8x32_internal).end2 + +.pass2_end1: + lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg m(idct_32x32_internal).pass2_loop + +.pass2_end2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x64_internal) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] 
+ mov [coeffq], eobd + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] + jmp m(inv_txfm_add_dct_dct_64x16).body + +cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r5d, 4 + mov r4d, 2 + sub eobd, 136 + cmovns r4d, r5d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*1+16*67], eobd + mov r3d, r4d + mov [rsp+gprsize*4+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*2+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, 
[o(m(idct_64x64_internal).pass1_end)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end1)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end2)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end3)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end4)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end5)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end6)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal).pass1_end7)] + jmp m(idct_8x8_internal).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea dstq, [dstq+32] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_64x64_internal).pass2_end)] + jmp m(idct_16x64_internal).pass2_loop + 
+.pass2_end: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + add rsp, 16*32 + mova [rsp+gprsize+16*0], m7 + lea r3, [o(m(idct_64x64_internal).pass2_end1)] + jmp m(idct_8x32_internal).end2 + +.pass2_end1: + add coeffq, 16*32 + sub rsp, 16*32 + + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_64x64_internal).pass2_end)] + + dec r3d + jg m(idct_16x64_internal).pass2_loop + +.pass2_end2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, 4 + sub dstq, 72 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal).end1)] + jmp m(idct_16x64_internal).pass2_loop diff --git a/third_party/dav1d/src/x86/loopfilter.asm b/third_party/dav1d/src/x86/loopfilter.asm new file mode 100644 index 0000000000..8cf20b685b --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter.asm @@ -0,0 +1,1601 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 16 db 7, 1 +pb_3_1: times 16 db 3, 1 +pb_2_1: times 16 db 2, 1 +pb_m1_0: times 16 db -1, 0 +pb_m1_1: times 16 db -1, 1 +pb_m1_2: times 16 db -1, 2 +pb_1: times 32 db 1 +pb_2: times 32 db 2 +pb_3: times 32 db 3 +pb_4: times 32 db 4 +pb_16: times 32 db 16 +pb_63: times 32 db 63 +pb_64: times 32 db 64 +pb_128: times 32 db 0x80 +pb_129: times 32 db 0x81 +pb_240: times 32 db 0xf0 +pb_248: times 32 db 0xf8 +pb_254: times 32 db 0xfe + +pw_2048: times 16 dw 2048 +pw_4096: times 16 dw 4096 + +pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd 
[dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm%4, m%4, 1 + vextracti128 xm%5, m%5, 1 + vextracti128 xm%2, m%2, 1 + vextracti128 xm%1, m%1, 1 + + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem +%if %1 == 0 + mova %3, m15 +%endif + + ; input in m0-15 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 + mova m13, %3 + mova %3, m12 + punpcklbw m12, m14, m13 + punpckhbw m13, m14, m13 + + ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 + punpcklwd m14, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd 
m10, m11, m12 + punpckhwd m11, m12 + mova m12, %3 + mova %3, m11 + punpcklwd m11, m12, m13 + punpckhwd m12, m13 + + ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpckldq m13, m14, m2 + punpckhdq m14, m2 + punpckldq m2, m15, m3 + punpckhdq m15, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m6, m10 + punpckhdq m6, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %3 + mova %3, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + + ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m12, m13, m4 + punpckhqdq m13, m4 + punpcklqdq m4, m14, m6 + punpckhqdq m14, m6 + punpcklqdq m6, m2, m8 + punpckhqdq m2, m8 + punpcklqdq m8, m15, m7 + punpckhqdq m15, m7 + punpcklqdq m7, m3, m10 + punpckhqdq m3, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m5, m11 + punpckhqdq m5, m11 + mova m11, %3 + mova %3, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %2 == 0 + mova m11, %3 +%endif + + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 + SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 + SWAP 3, 14, 12, 9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if %1 != 6 + mova m12, [tmpq+strideq*0] +%endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movd xm3, [dstq+strideq*0-2] + movd xm4, [dstq+strideq*1-2] + movd xm5, [dstq+strideq*2-2] + movd xm6, [dstq+stride3q -2] + lea tmpq, [dstq+strideq*4] + 
pinsrd xm3, [tmpq+strideq*0-2], 2 + pinsrd xm4, [tmpq+strideq*1-2], 2 + pinsrd xm5, [tmpq+strideq*2-2], 2 + pinsrd xm6, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 1 + pinsrd xm4, [tmpq+strideq*1-2], 1 + pinsrd xm5, [tmpq+strideq*2-2], 1 + pinsrd xm6, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 3 + pinsrd xm4, [tmpq+strideq*1-2], 3 + pinsrd xm5, [tmpq+strideq*2-2], 3 + pinsrd xm6, [tmpq+stride3q -2], 3 + lea tmpq, [tmpq+strideq*4] + movd xm12, [tmpq+strideq*0-2] + movd xm13, [tmpq+strideq*1-2] + movd xm14, [tmpq+strideq*2-2] + movd xm15, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 2 + pinsrd xm13, [tmpq+strideq*1-2], 2 + pinsrd xm14, [tmpq+strideq*2-2], 2 + pinsrd xm15, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 1 + pinsrd xm13, [tmpq+strideq*1-2], 1 + pinsrd xm14, [tmpq+strideq*2-2], 1 + pinsrd xm15, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 3 + pinsrd xm13, [tmpq+strideq*1-2], 3 + pinsrd xm14, [tmpq+strideq*2-2], 3 + pinsrd xm15, [tmpq+stride3q -2], 3 + vinserti128 m3, xm12, 1 + vinserti128 m4, xm13, 1 + vinserti128 m5, xm14, 1 + vinserti128 m6, xm15, 1 + + ; transpose 4x16 + ; xm3: A-D0,A-D8,A-D4,A-D12 + ; xm4: A-D1,A-D9,A-D5,A-D13 + ; xm5: A-D2,A-D10,A-D6,A-D14 + ; xm6: A-D3,A-D11,A-D7,A-D15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, 
m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m4, m5, m4 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm4: B0-15 + ; xm5: C0-15 + ; xm6: D0-15 +%elif %1 == 6 || %1 == 8 + movq xm3, [dstq+strideq*0-%1/2] + movq xm4, [dstq+strideq*1-%1/2] + movq xm5, [dstq+strideq*2-%1/2] + movq xm6, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + movhps xm3, [tmpq+strideq*0-%1/2] + movhps xm4, [tmpq+strideq*1-%1/2] + movhps xm5, [tmpq+strideq*2-%1/2] + movhps xm6, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m3, xm7, 1 + vinserti128 m4, xm8, 1 + vinserti128 m5, xm9, 1 + vinserti128 m6, xm11, 1 + lea tmpq, [dstq+strideq*4] + movq xm12, [tmpq+strideq*0-%1/2] + movq xm13, [tmpq+strideq*1-%1/2] + movq xm14, [tmpq+strideq*2-%1/2] + movq xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm12, [tmpq+strideq*0-%1/2] + movhps xm13, [tmpq+strideq*1-%1/2] + movhps xm14, [tmpq+strideq*2-%1/2] + movhps xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m12, xm7, 1 + vinserti128 m13, xm8, 1 + vinserti128 m14, xm9, 1 + vinserti128 m15, xm11, 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm12: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: 
A-H7,A-H15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + punpcklbw m6, m12, m13 + punpckhbw m12, m13 + punpcklbw m13, m14, m15 + punpckhbw m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m15, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m13 + punpckhwd m6, m13 + punpcklwd m13, m12, m14 + punpckhwd m12, m14 + ; xm15: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm13: A12-15,B12-15,C12-15,D12-15 + ; xm12: E12-15,F12-15,G12-15,H12-15 + punpckldq m14, m15, m5 + punpckhdq m15, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + punpckldq m6, m4, m13 + punpckhdq m4, m13 + punpckldq m13, m3, m12 +%if %1 != 6 + punpckhdq m12, m3, m12 +%endif + ; xm14: A0-7,B0-7 + ; xm15: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm13: E8-15,F8-15 + ; xm12: G8-15,H8-15 + punpcklqdq m3, m14, m6 + punpckhqdq m14, m6 + punpckhqdq m6, m15, m4 + punpcklqdq m15, m4 + punpcklqdq m4, m5, m13 + punpckhqdq m13, m5, m13 +%if %1 == 8 + punpcklqdq m5, m7, m12 + punpckhqdq m12, m7, m12 + ; xm3: A0-15 + ; xm14: B0-15 + ; xm15: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm13: F0-15 + ; xm5: G0-15 + ; xm12: H0-15 + SWAP 12, 3, 15 + SWAP 13, 14, 5, 4, 6 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 +%else + SWAP 13, 3, 14 + SWAP 6, 4, 15, 5 + ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 +%endif +%else + ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose + movu xm0, [dstq+strideq*0-8] + movu xm1, [dstq+strideq*1-8] + movu xm2, [dstq+strideq*2-8] + movu xm3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm4, [tmpq+strideq*0-8] + movu xm5, [tmpq+strideq*1-8] + movu xm6, [tmpq+strideq*2-8] + movu xm7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm8, [tmpq+strideq*0-8] + movu xm9, [tmpq+strideq*1-8] + movu xm10, [tmpq+strideq*2-8] + movu xm11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm12, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, [tmpq+strideq*0-8], 1 + vinserti128 m1, [tmpq+strideq*1-8], 1 + vinserti128 m2, [tmpq+strideq*2-8], 1 + vinserti128 m3, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, [tmpq+strideq*0-8], 1 + vinserti128 m5, [tmpq+strideq*1-8], 1 + vinserti128 m6, [tmpq+strideq*2-8], 1 + vinserti128 m7, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, [tmpq+strideq*0-8], 1 + vinserti128 m9, [tmpq+strideq*1-8], 1 + vinserti128 m10, [tmpq+strideq*2-8], 1 + vinserti128 m11, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m12, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + TRANSPOSE_16X16B 0, 1, [rsp+11*32] + mova [rsp+12*32], m1 + mova [rsp+13*32], m2 + mova [rsp+14*32], m3 + mova [rsp+15*32], m12 + mova [rsp+16*32], m13 + mova [rsp+17*32], m14 + mova [rsp+18*32], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + movhps xm1, [lq+l_strideq] + movhps xm2, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + 
movq xm10, [lq] + movq xm0, [lq+l_strideq*2] + movhps xm10, [lq+l_strideq] + movhps xm0, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + vinserti128 m1, xm10, 1 + vinserti128 m2, xm0, 1 + shufps m0, m1, m2, q3131 + shufps m1, m2, q2020 +%endif + pxor m2, m2 + pcmpeqb m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqb m10, m2, m0 ; !L + psrlq m2, m0, [lutq+128] + pand m2, [pb_63] + vpbroadcastb m1, [lutq+136] + pminub m2, m1 + pmaxub m2, [pb_1] ; I + pand m1, m0, [pb_240] + psrlq m1, 4 ; H + paddb m0, [pb_2] + paddb m0, m0 + paddb m0, m2 ; E + pxor m1, [pb_128] + pxor m2, [pb_128] + pxor m0, [pb_128] + + ABSSUB m8, m3, m4, m9 ; abs(p1-p0) + pmaxub m8, m10 + ABSSUB m9, m5, m6, m10 ; abs(q1-q0) + pmaxub m8, m9 +%if %1 == 4 + pxor m8, [pb_128] + pcmpgtb m7, m8, m1 ; hev +%else + pxor m7, m8, [pb_128] + pcmpgtb m7, m1 ; hev + +%if %1 == 6 + ABSSUB m9, m13, m4, m10 ; abs(p2-p0) + pmaxub m9, m8 +%else + ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + pmaxub m9, m8 + ABSSUB m10, m13, m4, m11 ; abs(p2-p0) + pmaxub m9, m10 +%endif + ABSSUB m10, m5, m14, m11 ; abs(q2-q0) + pmaxub m9, m10 +%if %1 != 6 + ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + pmaxub m9, m10 +%endif + pxor m9, [pb_128] + pcmpgtb m9, [pb_129] ; !flat8in + +%if %1 == 6 + ABSSUB m10, m13, m3, m1 ; abs(p2-p1) +%else + ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m11, m13, m3, m1 ; abs(p2-p1) + pmaxub m10, m11 + ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + pmaxub m10, m11 +%endif + ABSSUB m11, m14, m6, m1 ; abs(q2-q1) + pmaxub m10, m11 +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxub m8, m10 + + pxor m8, [pb_128] +%endif + pcmpgtb m8, m2 + + ABSSUB m10, m3, m6, m11 ; abs(p1-q1) + ABSSUB m11, m4, m5, m2 ; 
abs(p0-q0) + paddusb m11, m11 + pand m10, [pb_254] + psrlq m10, 1 + paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pxor m10, [pb_128] + pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+12*32] +%endif + ABSSUB m1, m0, m4, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+13*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+stride3q] +%else + mova m0, [rsp+14*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] +%else + mova m0, [rsp+15*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+16*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+17*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 + pxor m1, [pb_128] + pcmpgtb m1, [pb_129] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, [pb_mask] + pcmpeqd m10, [pb_mask] + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, [pb_mask] + pcmpeqd m0, [pb_mask] + pandn m8, m0 + pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, [pb_mask] + pcmpeqd m0, [pb_mask] + pandn m8, m0 ; fm +%endif + + ; short filter + + pxor m3, [pb_128] + pxor m6, [pb_128] + psubsb m10, 
m3, m6 ; iclip_diff(p1-q1) + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + pxor m4, [pb_128] + pxor m5, [pb_128] + psubsb m11, m5, m4 + paddsb m10, m11 + paddsb m10, m11 + paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pand m8, m10 ; f&=fm + paddsb m10, m8, [pb_3] + paddsb m8, [pb_4] + pand m10, [pb_248] + pand m8, [pb_248] + psrlq m10, 3 + psrlq m8, 3 + pxor m10, [pb_16] + pxor m8, [pb_16] + psubb m10, [pb_16] ; f2 + psubb m8, [pb_16] ; f1 + paddsb m4, m10 + psubsb m5, m8 + pxor m4, [pb_128] + pxor m5, [pb_128] + + pxor m8, [pb_128] + pxor m10, m10 + pavgb m8, m10 ; f=(f1+1)>>1 + psubb m8, [pb_64] + pandn m8, m7, m8 ; f&=!hev + paddsb m3, m8 + psubsb m6, m8 + pxor m3, [pb_128] + pxor m6, [pb_128] + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*32] + mova m2, [rsp+13*32] + mova m7, [rsp+14*32] +%endif + + mova [rsp+0*32], m9 + mova [rsp+1*32], m14 + mova [rsp+2*32], m15 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + punpcklbw m14, m0, m12 + punpckhbw m15, m0, m12 + pmaddubsw m10, m14, [pb_7_1] + pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 + punpcklbw m8, m2, m7 + punpckhbw m9, m2, m7 + pmaddubsw m8, [pb_2] + pmaddubsw m9, [pb_2] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3 + punpcklbw m8, m13, m3 + punpckhbw m9, m13, m3 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m8, m4, m5 + punpckhbw m9, m4, m5 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m2 + por m8, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p5 +%else + mova [rsp+13*32], m8 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, [pb_m1_1] + 
pmaddubsw m15, [pb_m1_1] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m0, m6 + punpckhbw m9, m0, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m7 + por m8, m9 +%ifidn %2, v + mova [tmpq+stride3q], m8 ; p4 +%else + mova [rsp+14*32], m8 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, [rsp+1*32] + punpcklbw m8, m0, m13 + punpckhbw m9, m0, m13 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m8, m2, m14 + punpckhbw m2, m14 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + mova [rsp+1*32], m8 + paddw m10, m8 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m12 + por m8, m9 +%ifidn %2, v + mova [tmpq+strideq*4], m8 ; p3 +%else + mova [rsp+19*32], m8 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, [rsp+2*32] + punpcklbw m8, m0, m3 + punpckhbw m9, m0, m3 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m8, m7, m15 + punpckhbw m7, m15 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m7, [pb_m1_1] + mova [rsp+2*32], m8 + paddw m10, m8 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m13 + por m8, m9 + mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] +%endif + punpcklbw m8, m0, m4 + punpckhbw m9, m0, m4 + pmaddubsw m8, 
[pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%ifidn %2, v + mova m9, [tmpq+strideq*0] ; q4 +%else + mova m9, [rsp+15*32] +%endif + punpcklbw m8, m12, m9 + punpckhbw m9, m12, m9 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+7*32], m8 + mova [rsp+5*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m3 + por m8, m9 + mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m9, [tmpq+strideq*1] ; q5 +%else + mova m9, [rsp+16*32] +%endif + punpcklbw m8, m0, m5 + punpckhbw m0, m5 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m0, [pb_m1_1] + paddw m10, m8 + paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m0, m13, m9 + punpckhbw m9, m13, m9 + mova m13, [rsp+6*32] + pmaddubsw m0, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+ 9*32], m0 + mova [rsp+10*32], m9 + paddw m10, m0 + paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m0, m10, [pw_2048] + pmulhrsw m8, m11, [pw_2048] + packuswb m0, m8 + pand m0, m1 + pandn m8, m1, m4 + por m0, m8 + mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+17*32] +%endif + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m8, m3, m0 + punpckhbw m9, m3, m0 + mova m3, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m5 + por m8, m9 + 
mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*32] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m8, m4, m0 + punpckhbw m2, m4, m0 + mova m4, [rsp+6*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + paddw m10, m8 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 + pmulhrsw m2, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m2, m9 + pand m2, m1 + pandn m9, m1, m6 + por m2, m9 ; don't clobber q1/m6 since we need it in J + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*32] + paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + punpcklbw m8, m5, m0 + punpckhbw m9, m5, m0 + mova m5, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m7, m9 + pand m7, m1 + pandn m9, m1, m14 + por m7, m9 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + paddw m10, [rsp+7*32] + paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + punpcklbw m8, m6, m0 + punpckhbw m9, m6, m0 + SWAP 2, 6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m15 + por m8, m9 +%ifidn %2, v + mova [tmpq+mstrideq], m8 ; q3 +%else + mova [rsp+20*32], m8 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, [rsp+ 9*32] + paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m8, m14, m0 + punpckhbw m9, m14, m0 + SWAP 14, 7 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; 
p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 +%ifidn %2, v + pandn m9, m1, [tmpq+strideq*0] +%else + pandn m9, m1, [rsp+15*32] +%endif + por m8, m9 +%ifidn %2, v + mova [tmpq+strideq*0], m8 ; q4 +%else + mova [rsp+15*32], m8 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6 + punpcklbw m8, m15, m0 + punpckhbw m9, m15, m0 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7 + pmulhrsw m10, [pw_2048] + pmulhrsw m11, [pw_2048] + packuswb m10, m11 + pand m10, m1 +%ifidn %2, v + pandn m11, m1, [tmpq+strideq*1] +%else + pandn m11, m1, [rsp+16*32] +%endif + por m10, m11 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+16*32], m10 +%endif + + mova m9, [rsp+0*32] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%endif +%if %1 >= 8 + ; flat8 filter + punpcklbw m0, m12, m3 + punpckhbw m1, m12, m3 + pmaddubsw m2, m0, [pb_3_1] + pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 + punpcklbw m8, m13, m4 + punpckhbw m11, m13, m4 + pmaddubsw m8, [pb_2_1] + pmaddubsw m11, [pb_2_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m8, m5, [pb_4] + punpckhbw m11, m5, [pb_4] + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + pand m8, m9 + pandn m11, m9, m13 + por m10, m8, m11 ; p2 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; p2 +%endif + + pmaddubsw m8, m0, [pb_m1_1] + pmaddubsw m11, m1, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + pand m8, m9 + 
pandn m11, m9, m3 + por m8, m11 ; p1 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p1 +%else + mova [rsp+0*32], m8 +%endif + + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m8, m4, m14 + punpckhbw m11, m4, m14 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + pand m8, m9 + pandn m11, m9, m4 + por m8, m11 ; p0 +%ifidn %2, v + mova [tmpq+stride3q ], m8 ; p0 +%else + mova [rsp+1*32], m8 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m8, m0, [pb_1] + pmaddubsw m11, m1, [pb_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m4, m12 + punpckhbw m11, m4, m12 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + psubw m2, m8 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + pand m8, m9 + pandn m11, m9, m5 + por m11, m8, m11 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 ; q0 +%endif + + pmaddubsw m0, [pb_m1_1] + pmaddubsw m1, [pb_m1_1] + paddw m2, m0 + paddw m7, m1 + punpcklbw m8, m13, m6 + punpckhbw m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m13, [pb_m1_1] + paddw m2, m8 + paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m8, m2, 3 + psrlw m13, m7, 3 + packuswb m8, m13 + pand m8, m9 + pandn m13, m9, m6 + por m13, m8, m13 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m13 ; q1 +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 + pand m2, m9 + pandn m7, m9, m14 + por m2, m7 ; q2 +%ifidn %2, v + mova [dstq+strideq*2], m2 ; q2 +%else + mova m0, [rsp+0*32] + mova m1, [rsp+1*32] +%if %1 == 8 + ; 16x8 transpose + punpcklbw m3, 
m12, m10 + punpckhbw m12, m10 + punpcklbw m10, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m11, m13 + punpcklbw m13, m2, m15 + punpckhbw m2, m15 + + punpcklwd m15, m3, m10 + punpckhwd m3, m10 + punpcklwd m10, m12, m0 + punpckhwd m12, m0 + punpcklwd m0, m1, m13 + punpckhwd m1, m13 + punpcklwd m13, m11, m2 + punpckhwd m11, m2 + + punpckldq m2, m15, m0 + punpckhdq m15, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m10, m13 + punpckhdq m10, m13 + punpckldq m13, m12, m11 + punpckhdq m12, m11 + + ; write 8x32 + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q -4], xm12 + lea dstq, [dstq+strideq*4] + + vextracti128 xm2, m2, 1 + vextracti128 xm15, m15, 1 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m3, 1 + vextracti128 xm1, m1, 1 + vextracti128 xm10, m10, 1 + vextracti128 xm13, m13, 1 + vextracti128 xm12, m12, 1 + + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q 
-4], xm12 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 5, 10, 2 + SWAP 6, 0 + SWAP 7, 1 + SWAP 8, 11 + SWAP 9, 13 + mova m0, [rsp+11*32] + mova m1, [rsp+12*32] + mova m2, [rsp+13*32] + mova m3, [rsp+14*32] + mova m4, [rsp+19*32] + mova m11, [rsp+20*32] + mova m12, [rsp+15*32] + mova m13, [rsp+16*32] + mova m14, [rsp+17*32] + TRANSPOSE_16X16B 1, 0, [rsp+18*32] + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m1, 1 + vextracti128 [dstq+strideq*2-8], m2, 1 + vextracti128 [dstq+stride3q -8], m3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m4, 1 + vextracti128 [dstq+strideq*1-8], m5, 1 + vextracti128 [dstq+strideq*2-8], m6, 1 + vextracti128 [dstq+stride3q -8], m7, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m8, 1 + vextracti128 [dstq+strideq*1-8], m9, 1 + vextracti128 [dstq+strideq*2-8], m10, 1 + vextracti128 [dstq+stride3q -8], m11, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m12, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [pb_3_1] + pmaddubsw m1, m11, [pb_3_1] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + 
pmaddubsw m2, m7, [pb_2] + pmaddubsw m12, m10, [pb_2] + paddw m0, m2 + paddw m1, m12 + pmulhrsw m2, m0, [pw_4096] + pmulhrsw m12, m1, [pw_4096] + packuswb m2, m12 + pand m2, m9 + pandn m12, m9, m3 + por m2, m12 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%endif + + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m12, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m12, m13 + pand m12, m9 + pandn m13, m9, m4 + por m12, m13 +%ifidn %2, v + mova [tmpq+stride3q], m12 ; p0 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 + pmaddubsw m14, m8, [pb_m1_1] + pmaddubsw m13, m11, [pb_m1_1] + paddw m0, m14 + paddw m1, m13 + pmulhrsw m14, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m14, m13 + pand m14, m9 + pandn m13, m9, m5 + por m14, m13 +%ifidn %2, v + mova [dstq+strideq*0], m14 ; q0 +%endif + + pmaddubsw m8, [pb_m1_2] + pmaddubsw m11, [pb_m1_2] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [pb_m1_0] + pmaddubsw m10, [pb_m1_0] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [pw_4096] + pmulhrsw m1, [pw_4096] + packuswb m0, m1 + pand m0, m9 + pandn m9, m6 + por m0, m9 +%ifidn %2, v + mova [dstq+strideq*1], m0 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + cmp 
byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 32 + add dstq, 32 ; step one ymm register (32 bytes) along the row, matching the 32-byte loads and stores of FILTER v + add maskq, 1 ; the byte tests at maskq+0/4/8 move on to the next mask byte + sub wd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 ; scale l_stride by 4 (presumably bytes per l entry; confirm against the C prototype) + sub lq, 4 ; back up 4 bytes so each 8-byte load in FILTER h holds the preceding entry and the current one + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] ; nothing to filter: skip 32 rows of dst (24 strides here, 8 more below) + lea lq, [lq+l_strideq*8] ; same total l advance as the two lea lq steps inside FILTER h + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_v_sb_uv, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq ; start one l row earlier: FILTER v reads [lq] as the fallback and [lq+l_strideq] as the current row + mov mstrideq, strideq + neg mstrideq ; mstride = -stride, used to address rows before dst + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 32 + add dstq, 32 + add maskq, 1 + sub wd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] ; nothing to filter: skip 32 rows of dst (24 strides here, 8 more below) + lea lq, [lq+l_strideq*8] ; same total l advance as the two lea lq steps inside FILTER h + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/loopfilter_init_tmpl.c 
b/third_party/dav1d/src/x86/loopfilter_init_tmpl.c new file mode 100644 index 0000000000..4d48c90491 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter_init_tmpl.c @@ -0,0 +1,60 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +#define decl_loopfilter_sb_fns(ext) \ +decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \ +decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \ +decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \ +decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext) + +decl_loopfilter_sb_fns(ssse3); /* declares the h/v, y/uv entry points carrying the _ssse3 suffix */ +decl_loopfilter_sb_fns(avx2); /* same four entry points with the _avx2 suffix */ + +COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { /* point c->loop_filter_sb[][] at the fastest x86 versions the CPU flags allow */ + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; /* without SSSE3 the table is left exactly as the caller set it */ + +#if BITDEPTH == 8 + c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3; /* first index: 0 = y, 1 = uv; second index: 0 = h, 1 = v */ + c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3; + c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3; + c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; /* no AVX2: keep the SSSE3 pointers installed above */ + +#if BITDEPTH == 8 && ARCH_X86_64 /* the AVX2 asm is only assembled under ARCH_X86_64 */ + c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2; + c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2; + c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2; + c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2; +#endif +} diff --git a/third_party/dav1d/src/x86/loopfilter_ssse3.asm b/third_party/dav1d/src/x86/loopfilter_ssse3.asm new file mode 100644 index 0000000000..cc70051a88 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter_ssse3.asm @@ -0,0 +1,2348 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 8 db 7, 1 +pb_3_1: times 8 db 3, 1 +pb_2_1: times 8 db 2, 1 +pb_m1_0: times 8 db -1, 0 +pb_m1_1: times 8 db -1, 1 +pb_m1_2: times 8 db -1, 2 +pb_1: times 16 db 1 +pb_2: times 16 db 2 +pb_3: times 16 db 3 +pb_4: times 16 db 4 +pb_16: times 16 db 16 +pb_63: times 16 db 63 +pb_64: times 16 db 64 +pb_128: times 16 db 0x80 +pb_129: times 16 db 0x81 +pb_240: times 16 db 0xf0 +pb_248: times 16 db 0xf8 +pb_254: times 16 db 0xfe + +pw_2048: times 8 dw 2048 +pw_4096: times 8 dw 4096 + +pd_mask: dd 1, 2, 4, 8 + +SECTION .text +; Per-byte unsigned absolute difference: dst = |a - b|; tmp is clobbered. +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 ; dst = a - b, saturating to 0 where b > a + psubusb %4, %3, %2 ; tmp = b - a, saturating to 0 where a > b + por %1, %4 ; at most one of the two is non-zero per byte, so OR gives |a - b| +%endmacro +; Transposes four 16-byte registers and stores the result as 16 rows of 4 bytes starting at [dstq-2]; all inputs are overwritten. +%macro TRANSPOSE_16x4_AND_WRITE_4x16 5 ; %1-%4: input register numbers, %5: scratch register number + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out +%assign %%n 0 +%rep 4 + movd [dstq+strideq *0-2], xm%4 + movd [dstq+strideq *4-2], xm%5 + movd [dstq+strideq *8-2], xm%2 + movd [dstq+stride3q*4-2],
xm%1 + add dstq, strideq ; step down one row +%if %%n < 3 ; the shift is skipped after the final store + psrldq xm%4, 4 ; bring the next dword into the low lane + psrldq xm%5, 4 + psrldq xm%2, 4 + psrldq xm%1, 4 +%endif +%assign %%n (%%n+1) +%endrep + lea dstq, [dstq+stride3q*4] ; assuming stride3q = 3*strideq: 4 rows stepped above plus 12 here, so dstq ends 16 rows further down +%endmacro +; 16x16 byte transpose built from four unpack passes (bytes, words, dwords, qwords). %2 is a 16-byte memory scratch slot; the x86-32 path also uses fixed [esp+N*16] slots. %1 selects between two variants that differ in stack slots and final register layout. +%macro TRANSPOSE_16X16B 2 ; output_transpose, mem +%if %1 == 0 + mova %2, m15 ; m7 in 32-bit +%endif + + ; input in m0-7 + punpcklbw m15, m0, m1 ; pass 1 (first eight inputs): interleave bytes of register pairs + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 +%if ARCH_X86_64 + SWAP 4, 5, 7 +%else + %if %1 == 0 + mova m5, %2 + %else + mova m5, [esp+1*16] + %endif + mova %2, m4 +%endif + punpcklbw m4, m6, m5 + punpckhbw m6, m5 + + ; interleaved in m15,0,1,2,3,7,4,6 + punpcklwd m5, m15, m1 ; pass 2: interleave words + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m4 + punpckhwd m3, m4 +%if ARCH_X86_64 + SWAP 3, 4, 7 +%else + mova m4, %2 + mova %2, m3 +%endif + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + + ; interleaved in m5,15,1,0,2,7,3,4 + punpckldq m6, m5, m2 ; pass 3: interleave dwords + punpckhdq m5, m2 +%if ARCH_X86_64 + SWAP 2, 7, 5 +%else + mova m2, %2 + mova [esp+1*16], m5 +%endif + punpckldq m5, m15, m2 + punpckhdq m15, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m3, m0, m4 + punpckhdq m0, m4 + +%if ARCH_X86_32 ; x86-32: park the first half's results on the stack, then reload the second half's inputs from it + mova [esp+0*16], m6 + mova [esp+2*16], m5 + mova [esp+3*16], m15 + mova [esp+4*16], m2 + mova [esp+5*16], m1 + mova [esp+6*16], m3 + mova [esp+7*16], m0 + mova m8, [esp+ 8*16] + mova m9, [esp+ 9*16] + mova m10, [esp+10*16] + %if %1 == 0 + mova m11, [esp+11*16] + mova m12, [esp+12*16] + mova m13, [esp+13*16] + mova m14, [esp+14*16] + %else + mova m11, [esp+20*16] + mova m12, [esp+15*16] + mova m13, [esp+16*16] + mova m14, [esp+17*16] + %endif +%endif + + ; input in m8-m15 +%if ARCH_X86_64 + SWAP 7, 4 +%endif + punpcklbw m7, m8, m9 ; the same three passes, now for the second eight inputs + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 +%if ARCH_X86_64 + mova m13, %2 +%else + %if %1 == 0 + mova m13, [esp+15*16] + %else + mova m13, [esp+18*16] + %endif +%endif + mova %2, m12 +
punpcklbw m12, m14, m13 + punpckhbw m14, m14, m13 + + ; interleaved in m7,8,9,10,11,rsp%2,12,14 + punpcklwd m13, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + mova m12, %2 + mova %2, m11 + punpcklwd m11, m12, m14 + punpckhwd m12, m14 + + ; interleaved in m13,7,9,8,10,rsp%2,11,12 + punpckldq m14, m13, m10 + punpckhdq m13, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %2 + mova %2, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + +%if ARCH_X86_32 + mova [esp+ 8*16], m10 + mova [esp+ 9*16], m9 + mova [esp+10*16], m11 + SWAP 6, 1 + SWAP 4, 2 + SWAP 5, 3 + mova m6, [esp+0*16] + mova m4, [esp+1*16] + mova m5, [esp+2*16] +%endif + + ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 + punpcklqdq m12, m6, m14 ; pass 4: interleave qwords, pairing a first-half result with a second-half result + punpckhqdq m6, m14 + punpcklqdq m14, m4, m13 + punpckhqdq m4, m13 + punpcklqdq m13, m5, m8 + punpckhqdq m5, m8 +%if ARCH_X86_64 + SWAP 8, 5 +%else + mova m8, [esp+3*16] + mova [esp+27*16], m5 + %define m15 m8 +%endif + punpcklqdq m5, m15, m7 + punpckhqdq m15, m7 + +%if ARCH_X86_32 + mova [esp+11*16], m12 + mova [esp+12*16], m6 + mova [esp+13*16], m14 + mova [esp+14*16], m4 + mova [esp+26*16], m13 + mova [esp+ 0*16], m5 + mova [esp+ 1*16], m15 + mova m2, [esp+ 4*16] + mova m10, [esp+ 8*16] + mova m1, [esp+ 5*16] + mova m9, [esp+ 9*16] + mova m3, [esp+ 6*16] + mova m11, [esp+10*16] + mova m0, [esp+ 7*16] +%endif + + punpcklqdq m7, m2, m10 + punpckhqdq m2, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m3, m11 + punpckhqdq m3, m11 + mova m11, %2 +%if ARCH_X86_32 + %define m12 m3 +%endif + mova %2, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %1 == 1 + mova m11, %2 +%endif + +%if ARCH_X86_64 + ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 + SWAP 0, 11, 1, 6, 5, 8, 7, 15 ; NOTE(review): presumably renames registers so the rows land in m0-m15 order - confirm against x86inc SWAP + SWAP 2, 14, 12, 9 + SWAP 3, 4, 13 +%else + %if %1 == 0 + mova [esp+15*16], m9 + mova [esp+17*16], m12 + mova [esp+18*16], m0
+ mova [esp+28*16], m10 + mova [esp+29*16], m1 + mova m3, [esp+0*16] + mova m4, [esp+1*16] + SWAP m5, m7 + SWAP m6, m2 + %else + SWAP 0, 7 + SWAP 3, 1, 2, 4, 6 + %endif +%endif +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%if ARCH_X86_64 + %define %%flat8mem [rsp+0*16] + %define %%q2mem [rsp+1*16] + %define %%q3mem [rsp+2*16] +%else + %if %1 == 4 || %1 == 6 + %define %%p2mem [esp+ 8*16] + %define %%q2mem [esp+ 9*16] + %define %%flat8mem [esp+10*16] + %else + %ifidn %2, v + %define %%p2mem [esp+16*16] + %define %%q2mem [esp+ 1*16] + %define %%q3mem [esp+18*16] + %define %%flat8mem [esp+ 0*16] + %define %%flat16mem [esp+20*16] + %else + %define %%p2mem [esp+27*16] + %define %%q2mem [esp+28*16] + %define %%q3mem [esp+29*16] + %define %%flat8mem [esp+21*16] + %define %%flat16mem [esp+30*16] + %endif + %endif + %xdefine m12reg m12 +%endif + +%if ARCH_X86_32 + lea stride3q, [strideq*3] +%endif + ; load data +%ifidn %2, v +%if ARCH_X86_32 + mov mstrideq, strideq + neg mstrideq +%endif +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later +%define %%p3mem [dstq+mstrideq*4] + %if ARCH_X86_32 + %define m13 m0 + %define m14 m1 + %define m15 m2 + %endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif + %if ARCH_X86_32 + mova %%p2mem, m13 + mova %%q2mem, m14 + %define m13 %%p2mem + %define m14 %%q2mem + %if %1 != 6 + mova %%q3mem, m15 + %define m15 %%q3mem + %endif + %endif +%endif +%else ; %2 == h + ; load lines +%if %1 == 4 + ; transpose 4x16 + movd m7, [dstq+strideq*0-2] + movd m3, [dstq+strideq*1-2] + movd m4, [dstq+strideq*2-2] + movd m5, [dstq+stride3q
-2] + lea tmpq, [dstq+strideq*4] + punpcklbw m7, m3 + punpcklbw m4, m5 + movd m3, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + movd m5, [tmpq+strideq*2-2] + movd m6, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m7, m0 + punpcklqdq m4, m1 + lea tmpq, [tmpq+strideq*4] + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m3, m0 + punpcklqdq m5, m1 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m5, m5, m4 + punpcklqdq m4, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm5: B0-15 + ; xm4: C0-15 + ; xm6: D0-15 + SWAP 4, 5 +%elif %1 == 6 || %1 == 8 + ; transpose 8x16 + movq m7, [dstq+strideq*0-%1/2] + movq m3, [dstq+strideq*1-%1/2] + movq m4, [dstq+strideq*2-%1/2] + movq m5, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + punpcklbw m7, m3 + punpcklbw m4, m5 + movq m3, [tmpq+strideq*0-%1/2] + movq m1, [tmpq+strideq*1-%1/2] + movq m5, [tmpq+strideq*2-%1/2] + movq m6, [tmpq+stride3q -%1/2] + lea tmpq, [dstq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movq m6, [tmpq+strideq*0-%1/2] + movq m0, [tmpq+strideq*1-%1/2] + movq m1, [tmpq+strideq*2-%1/2] + movq m2, 
[tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + punpcklbw m6, m0 + punpcklbw m1, m2 + movq m2, [tmpq+strideq*2-%1/2] + movq m0, [tmpq+stride3q -%1/2] + punpcklbw m2, m0 +%if ARCH_X86_64 + SWAP m15, m2 +%else + %define m15 [esp+3*16] + mova m15, m2 +%endif + movq m0, [tmpq+strideq*0-%1/2] + movq m2, [tmpq+strideq*1-%1/2] + punpcklbw m0, m2 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m2, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m1 + punpckhwd m6, m1 + punpcklwd m1, m0, m15 + punpckhwd m0, m15 +%if ARCH_X86_64 + SWAP m15, m0 +%else + mova m15, m0 +%endif + ; xm2: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm1: A12-15,B12-15,C12-15,D12-15 + ; xm0: E12-15,F12-15,G12-15,H12-15 + punpckldq m0, m2, m5 + punpckhdq m2, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + punpckldq m6, m4, m1 + punpckhdq m4, m1 + punpckldq m1, m3, m15 +%if %1 != 6 + punpckhdq m3, m15 + %if ARCH_X86_64 + SWAP m15, m3 + %else + mova m15, m3 + %endif +%endif + ; xm0: A0-7,B0-7 + ; xm2: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm1: E8-15,F8-15 + ; xm3: G8-15,H8-15 + punpcklqdq m3, m0, m6 + punpckhqdq m0, m6 + punpckhqdq m6, m2, m4 + punpcklqdq m2, m4 + punpcklqdq m4, m5, m1 + punpckhqdq m5, m1 +%if %1 == 8 + punpcklqdq m1, m7, m15 + punpckhqdq m7, m15 + ; xm3: A0-15 + ; xm0: B0-15 + ; xm2: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm5: F0-15 + ; 
xm1: G0-15 + ; xm7: H0-15 +%if ARCH_X86_64 + SWAP 11, 3, 2 + SWAP 13, 0 + SWAP 6, 5, 4 + SWAP 14, 1 + SWAP 15, 7 + ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 + mova [rsp+21*16], m11 + %define %%p3mem [rsp+21*16] +%else + %define m11 [esp+26*16] + %define m13 [esp+27*16] + %define m14 [esp+28*16] + %define m15 [esp+29*16] + mova m11, m3 + mova m13, m0 + SWAP 3, 2 + SWAP 6, 5, 4 + mova m14, m1 + mova m15, m7 + %define %%p3mem [esp+26*16] +%endif +%else + %if ARCH_X86_64 + SWAP 13, 3, 0 + SWAP 14, 5, 6, 4, 2 + ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 + %else + %define m13 %%p2mem + %define m14 %%q2mem + mova m13, m3 + mova m14, m5 + SWAP 3, 0 + SWAP 5, 6, 4, 2 + ; 0,2,6,4 -> 3,4,5,6 + %endif +%endif +%else +%if ARCH_X86_64 + mova [rsp+20*16], m12 +%endif + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose +%if ARCH_X86_32 + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + lea tmpq, [dstq+strideq*8] + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] + mova [esp+ 8*16], m8 + mova [esp+ 9*16], m9 + mova [esp+10*16], m10 + mova [esp+11*16], m11 + mova [esp+12*16], m12 + mova [esp+13*16], m13 + mova [esp+14*16], m14 + mova [esp+15*16], m15 +%endif + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] + movu m7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] +%if ARCH_X86_64 + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + 
movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] +%endif + +%if ARCH_X86_64 + TRANSPOSE_16X16B 0, [rsp+11*16] + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + mova [rsp+15*16], m12 + mova [rsp+16*16], m13 + mova [rsp+17*16], m14 + mova [rsp+18*16], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 + mova [rsp+21*16], m12 + %define %%p3mem [rsp+21*16] + mova m12, [rsp+20*16] +%else + TRANSPOSE_16X16B 0, [esp+16*16] + %define %%p3mem [esp+26*16] + %define m11 %%p3mem + %define m13 %%p2mem + %define m14 %%q2mem + %define m15 %%q3mem +%endif +%endif ; if 4 elif 6 or 8 else 16 +%endif ; if v else h + + ; load L/E/I/H +%if ARCH_X86_32 + mov l_strideq, l_stridem +%endif +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + %if ARCH_X86_32 + lea l_stride3q, [l_strideq*3] + %endif + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + movhps xm1, [lq+l_strideq] + movhps xm2, [lq+l_stride3q] + shufps m0, m1, m2, q3131 + shufps m1, m2, q2020 + %if ARCH_X86_32 + lea stride3q, [strideq*3] + %endif +%endif + +%if ARCH_X86_32 + %ifidn %2, v + mov lutd, lutm + %endif +%endif + pxor m2, m2 + pcmpeqb m7, m2, m0 + pand m1, m7 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] + pcmpeqb m2, m0 ; !L + psrlq m7, m0, [lutq+128] + pand m7, [PIC_sym(pb_63)] + pminub m7, minlvl + pmaxub m7, [PIC_sym(pb_1)] ; I + pand m1, m0, [PIC_sym(pb_240)] + psrlq m1, 4 ; H + paddb m0, [PIC_sym(pb_2)] + paddb m0, m0 + paddb m0, m7 ; E + pxor m1, [PIC_sym(pb_128)] + pxor m7, [PIC_sym(pb_128)] + pxor m0, [PIC_sym(pb_128)] + SWAP 2, 7 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 2, 10 +%else + %ifidn %2, v + mov mstrideq, strideq + neg mstrideq + %if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + %elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] + %endif + %endif + mova [esp+3*16], m0 + mova [esp+4*16], m2 +%endif + + ABSSUB m0, m3, m4, m2 ; abs(p1-p0) + pmaxub m0, m7 + ABSSUB m2, m5, m6, m7 ; abs(q1-q0) + pmaxub m0, m2 +%if %1 == 4 + pxor m0, [PIC_sym(pb_128)] + pcmpgtb m7, m0, m1 ; hev + %if ARCH_X86_64 + SWAP 7, 11 + %else + mova [esp+5*16], m7 + %endif +%else + pxor m7, m0, [PIC_sym(pb_128)] + pcmpgtb m7, m1 ; hev +%if ARCH_X86_64 + SWAP 7, 11 +%else + mova [esp+5*16], m7 +%endif + +%if %1 == 6 + ABSSUB m1, m13, m4, m7 ; abs(p2-p0) + pmaxub m1, m0 +%else + mova m2, %%p3mem + ABSSUB m1, m2, m4, m7 ; abs(p3-p0) + pmaxub m1, m0 + ABSSUB m7, m13, m4, m2 ; abs(p2-p0) + pmaxub m1, m7 +%endif + ABSSUB m7, m5, m14, m2 ; abs(p2-p0) + pmaxub m1, m7 +%if %1 != 6 + ABSSUB m7, m5, m15, m2 ; abs(q3-q0) + pmaxub m1, m7 +%endif + pxor m1, [PIC_sym(pb_128)] + pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in +%if ARCH_X86_64 + SWAP 1, 9 +%else + mova [esp+6*16], m1 +%endif + +%if %1 == 6 + ABSSUB m7, m13, m3, m1 ; abs(p2-p1) +%else + mova m2, %%p3mem + ABSSUB m7, m2, m13, m1 ; abs(p3-p2) + ABSSUB m2, m13, m3, m1 ; abs(p2-p1) + pmaxub m7, m2 + ABSSUB m2, m14, m15, m1 ; abs(q3-q2) + pmaxub m7, m2 +%endif + ABSSUB m2, m14, m6, m1 ; abs(q2-q1) + pmaxub m7, m2 +%if ARCH_X86_32 + %define m12 m1 + mova m12, maskmem +%endif + pand m2, m12, mask1 + pcmpeqd m2, m12 + pand m7, m2 ; only apply fm-wide to wd>4 blocks + pmaxub 
m0, m7 + + pxor m0, [PIC_sym(pb_128)] +%endif ; %if %1 == 4 else +%if ARCH_X86_64 + SWAP 2, 10 + pcmpgtb m0, m2 +%else + pcmpgtb m0, [esp+4*16] +%endif + + ABSSUB m1, m3, m6, m7 ; abs(p1-q1) + ABSSUB m7, m4, m5, m2 ; abs(p0-q0) + paddusb m7, m7 + pand m1, [PIC_sym(pb_254)] + psrlq m1, 1 + paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pxor m1, [PIC_sym(pb_128)] +%if ARCH_X86_64 + pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E +%else + pcmpgtb m1, [esp+3*16] +%endif + por m0, m1 + +%if %1 == 16 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [esp+3*16], m0 +%endif +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+12*16] +%endif + ABSSUB m1, m0, m4, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+13*16] +%endif + ABSSUB m2, m0, m4, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+stride3q] +%else + mova m0, [rsp+14*16] +%endif + ABSSUB m2, m0, m4, m7 + pmaxub m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] +%else + mova m0, [rsp+15*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+16*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+17*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 + pxor m1, [PIC_sym(pb_128)] + pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out +%if ARCH_X86_64 + por m1, m9 ; !flat8in | !flat8out +%else + por m1, [esp+6*16] + %define m12 m7 + mova m12, maskmem +%endif + pand m2, m12, mask2 + pcmpeqd m2, m12 + pandn m1, m2 ; flat16 +%if ARCH_X86_64 + pandn m2, m8, m1 ; flat16 & fm +%else + pandn m2, [esp+3*16], m1 ; flat16 & fm + mova %%flat16mem, m2 +%endif + SWAP 1, 2 + + pand m2, m12, mask1 + pcmpeqd m2, m12 +%if ARCH_X86_64 + pandn m9, m2 ; flat8in + pandn m2, m8, m9 + SWAP 2, 9 +%else + pandn m0, [esp+6*16], m2 + pandn m2, [esp+3*16], m0 + mova [esp+6*16], m2 +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 +%if ARCH_X86_64 + 
pandn m8, m2 + pandn m2, m9, m8 ; fm & !flat8 & !flat16 + SWAP 2, 8 + pandn m2, m1, m9 ; flat8 & !flat16 + SWAP 2, 9 + SWAP 0, 8 + SWAP 1, 10 +%else + pandn m0, [esp+3*16], m2 + pandn m2, [esp+6*16], m0 + SWAP 2, 0 + pandn m2, m1, [esp+6*16] + mova %%flat8mem, m2 +%endif +%elif %1 != 4 + %if ARCH_X86_64 + SWAP 1, 9 + %else + %define m12 m7 + mova m12, maskmem + mova m1, [esp+6*16] + %endif + pand m2, m12, mask1 + pcmpeqd m2, m12 + pandn m1, m2 + pandn m2, m0, m1 ; flat8 & fm + pand m1, m12, mask0 + pcmpeqd m1, m12 + pandn m0, m1 + pandn m1, m2, m0 ; fm & !flat8 + SWAP 1, 2, 0 + %if ARCH_X86_64 + SWAP 1, 9 + %else + mova %%flat8mem, m1 + %endif +%else +%if ARCH_X86_32 + %define m12 m1 + mova m12, maskmem +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 + pandn m0, m2 ; fm +%endif + + ; short filter + + mova m1, [PIC_sym(pb_128)] +%if ARCH_X86_64 + SWAP 7, 11 +%else + mova m7, [esp+5*16] +%endif + pxor m3, m1 + pxor m6, m1 + pxor m4, m1 + pxor m5, m1 + psubsb m1, m3, m6 ; iclip_diff(p1-q1) + pand m1, m7 ; f=iclip_diff(p1-q1)&hev + psubsb m2, m5, m4 + paddsb m1, m2 + paddsb m1, m2 + paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) + mova m2, [PIC_sym(pb_16)] + pand m0, m1 ; f&=fm + paddsb m1, m0, [PIC_sym(pb_3)] + paddsb m0, [PIC_sym(pb_4)] + pand m1, [PIC_sym(pb_248)] + pand m0, [PIC_sym(pb_248)] + psrlq m1, 3 + psrlq m0, 3 + pxor m1, m2 + pxor m0, m2 + psubb m1, m2 ; f2 + psubb m0, m2 ; f1 + mova m2, [PIC_sym(pb_128)] + paddsb m4, m1 + psubsb m5, m0 + pxor m4, m2 + pxor m5, m2 + + pxor m0, m2 + pxor m1, m1 + pavgb m0, m1 ; f=(f1+1)>>1 + psubb m0, [PIC_sym(pb_64)] + pandn m7, m0 ; f&=!hev + paddsb m3, m7 + psubsb m6, m7 + pxor m3, m2 + pxor m6, m2 + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*16] + mova m2, [rsp+13*16] + mova m7, [rsp+14*16] +%endif + +%if ARCH_X86_64 + SWAP 1, 10 + mova %%flat8mem, m9 + mova %%q2mem, m14 
+ mova %%q3mem, m15 + SWAP 0, 8 + SWAP 1, 9 +%else + %ifidn %2, v + mova [esp+17*16], m0 + mova [esp+19*16], m3 + mova [esp+21*16], m4 + mova [esp+22*16], m5 + mova [esp+23*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+17*16] + %define m9 %%flat16mem + %define m3 [esp+19*16] + %define m4 [esp+21*16] + %define m5 [esp+22*16] + %define m6 [esp+23*16] + %else + mova [esp+31*16], m0 + mova [esp+32*16], m3 + mova [esp+33*16], m4 + mova [esp+34*16], m5 + mova [esp+35*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+31*16] + %define m9 %%flat16mem + %define m3 [esp+32*16] + %define m4 [esp+33*16] + %define m5 [esp+34*16] + %define m6 [esp+35*16] + %endif +%endif + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + mova m11, %%p3mem +%if ARCH_X86_64 + punpcklbw m14, m8, m11 + punpckhbw m15, m8, m11 +%else + punpcklbw m14, m0, m11 + punpckhbw m15, m0, m11 +%endif +%ifidn %2, v + mova [rsp+5*16], m11 +%endif + pmaddubsw m10, m14, [PIC_sym(pb_7_1)] + pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + pmaddubsw m0, [PIC_sym(pb_2)] + pmaddubsw m1, [PIC_sym(pb_2)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3 + punpcklbw m0, m13, m3 + punpckhbw m1, m13, m3 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m0, m4, m5 + punpckhbw m1, m4, m5 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m2 + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*2], m0 ; p5 +%else + mova [rsp+13*16], m0 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, 
[PIC_sym(pb_m1_1)] + pmaddubsw m15, [PIC_sym(pb_m1_1)] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m0, m8, m6 + punpckhbw m1, m8, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m7 + por m0, m1 +%ifidn %2, v + mova [tmpq+stride3q], m0 ; p4 +%else + mova [rsp+14*16], m0 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, %%q2mem + punpcklbw m0, m8, m13 + punpckhbw m1, m8, m13 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m0, m2, m14 + punpckhbw m2, m14 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + mova [rsp+1*16], m0 + paddw m10, m0 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, %%p3mem + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m0 ; p3 +%else + mova [rsp+19*16], m0 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, %%q3mem + punpcklbw m0, m8, m3 + punpckhbw m1, m8, m3 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m0, m7, m15 + punpckhbw m7, m15 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+2*16], m0 +%if ARCH_X86_32 + %ifidn %2, v + mova [esp+24*16], m7 + %else + mova [esp+36*16], m7 + %endif +%endif + paddw m10, m0 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + 
pandn m1, m9, m13 + por m0, m1 + mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 + punpcklbw m0, m8, m4 + punpckhbw m1, m8, m4 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%if ARCH_X86_64 + SWAP 7, 8 +%endif +%ifidn %2, v + mova m1, [dstq+strideq*4] ; q4 + mova m7, [rsp+5*16] ; (pre-filter) p3 +%else + mova m1, [rsp+15*16] + mova m7, %%p3mem ; (pre-filter) p3 +%endif + punpcklbw m0, m1, m7 + punpckhbw m1, m1, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+7*16], m0 + mova [rsp+5*16], m1 + psubw m10, m0 + psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m3 + por m0, m1 + mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m7, [tmpq+strideq*1] ; p6 + lea tmpq, [dstq+strideq*4] + mova m1, [tmpq+strideq*1] ; q5 +%else + mova m7, [rsp+12*16] ; p6 + mova m1, [rsp+16*16] +%endif + punpcklbw m0, m7, m5 + punpckhbw m7, m5 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m7, m13, m1 + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+9*16], m7 + paddw m10, m7 +%if ARCH_X86_64 + punpckhbw m13, m1 + mova m1, [rsp+6*16] + SWAP 1, 13 +%else + punpckhbw m7, m13, m1 + mova m1, [esp+6*16] + mova m13, m1 + SWAP 1, 7 +%endif + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+10*16], m1 + paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m7, m10, [PIC_sym(pw_2048)] + pmulhrsw m0, m11, [PIC_sym(pw_2048)] + packuswb m7, m0 + pand m7, m9 + pandn m0, m9, m4 + por m7, m0 + mova [rsp+6*16], m7 ; don't clobber p0/m4 
since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m7, [tmpq+strideq*2] ; q6 +%else + mova m7, [rsp+17*16] +%endif + paddw m10, [rsp+3*16] + paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m0, m3, m7 + punpckhbw m1, m3, m7 +%if ARCH_X86_64 + mova m3, [rsp+8*16] +%endif + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m5 + por m0, m1 +%if ARCH_X86_32 + mova m1, [esp+8*16] + mova m3, m1 +%endif + mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*16] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m0, m4, m7 + punpckhbw m2, m4, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 +%if ARCH_X86_64 + mova m4, [rsp+6*16] +%else + %define m4 [esp+6*16] +%endif + pmulhrsw m2, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m2, m1 + pand m2, m9 + pandn m1, m9, m6 + por m2, m1 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*16] +%if ARCH_X86_64 + SWAP 7, 8 + paddw m11, m7 +%else + mova m8, m7 + %ifidn %2, v + paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %else + paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %endif +%endif + punpcklbw m0, m5, m8 + punpckhbw m1, m5, m8 +%if ARCH_X86_64 + mova m5, [rsp+8*16] +%else + %define m5 [esp+8*16] +%endif + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw 
m10, m0 + paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m7, m1 + pand m7, m9 + pandn m1, m9, m14 + por m7, m1 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + psubw m10, [rsp+7*16] + psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + punpcklbw m0, m6, m8 + punpckhbw m1, m6, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m15 + por m0, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m0 ; q3 +%else + mova [rsp+20*16], m0 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, [rsp+ 9*16] + paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m0, m14, m8 + punpckhbw m1, m14, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 +%ifidn %2, v + pandn m1, m9, [tmpq+strideq*0] +%else + pandn m1, m9, [rsp+15*16] +%endif + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m0 ; q4 +%else + mova [rsp+15*16], m0 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, [rsp+3*16] + paddw m11, [rsp+4*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m0, m15, m8 + punpckhbw m1, m15, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, [PIC_sym(pw_2048)] + pmulhrsw m11, [PIC_sym(pw_2048)] + packuswb m10, m11 + pand m10, m9 +%ifidn %2, v + pandn m11, m9, [tmpq+strideq*1] +%else + pandn 
m11, m9, [rsp+16*16] +%endif + por m10, m11 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+16*16], m10 +%endif + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 14, 7 +%else + %xdefine m3 m11 + %xdefine m4 m14 + %xdefine m5 m15 + %xdefine m6 m10 + mova %%q2mem, m7 + %ifidn %2, v + mova m3, [esp+19*16] + %else + mova m3, [esp+32*16] + %endif + mova m4, [esp+ 6*16] + mova m5, [esp+ 8*16] +%endif + SWAP m6, m2 + +%if ARCH_X86_64 + mova m9, %%flat8mem +%endif +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%endif ; if %1 == 16 +%if %1 >= 8 + ; flat8 filter +%if ARCH_X86_32 + %define m9 %%flat8mem + %define m11 m1 + %define m13 %%p2mem + %define m14 %%q2mem + %define m15 %%q3mem +%endif + mova m11, %%p3mem + punpcklbw m0, m11, m3 + punpcklbw m7, m13, m4 + pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 + pmaddubsw m7, [PIC_sym(pb_2_1)] + paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m7, m5, [PIC_sym(pb_4)] + pmaddubsw m7, [PIC_sym(pb_1)] + paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + punpckhbw m1, m11, m3 + pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 + punpckhbw m0, m13, m4 + pmaddubsw m0, [PIC_sym(pb_2_1)] + paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + punpckhbw m0, m5, [PIC_sym(pb_4)] + pmaddubsw m0, [PIC_sym(pb_1)] + paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m13 + por m0, m1 ; p2 +%ifidn %2, v + mova [tmpq+strideq*1], m0 +%else + %if ARCH_X86_64 + SWAP 0, 10 + %else + mova [esp+2*16], m0 + %endif +%endif + +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m3 + punpckhbw m1, m11, m3 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 + punpcklbw m0, m13, m6 + punpckhbw m1, m13, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, 
m1 + pand m0, m9 + pandn m1, m9, m3 + por m0, m1 ; p1 +%ifidn %2, v + mova [tmpq+strideq*2], m0 +%else + mova [rsp+0*16], m0 +%endif + +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m3 + punpckhbw m1, m11, m3 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m4, m14 + punpckhbw m1, m4, m14 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m4 + por m0, m1 ; p0 +%ifidn %2, v + mova [tmpq+stride3q], m0 +%else + mova [rsp+1*16], m0 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m4 + punpckhbw m11, m11, m4 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m11, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m0, m2, 3 + psrlw m11, m7, 3 + packuswb m0, m11 + pand m0, m9 + pandn m11, m9, m5 + por m11, m0 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 +%elif ARCH_X86_32 + mova [esp+8*16], m11 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 + punpcklbw m0, m13, m6 + punpckhbw m1, m13, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m6 + por m0, m1 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m0 +%else + %if ARCH_X86_64 + SWAP 0, 13 + %else + mova [esp+9*16], m0 + %endif +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, 
m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 + pand m2, m9 + pandn m7, m9, m14 + por m2, m7 ; q2 +%ifidn %2, v + mova [dstq+strideq*2], m2 +%else + mova m0, [rsp+0*16] +%if %1 == 8 + mova m1, [rsp+1*16] + mova m4, %%p3mem + +%if ARCH_X86_32 + %define m10 [esp+2*16] + %define m11 [esp+8*16] + %define m13 [esp+9*16] +%endif + + ; 16x8 transpose + punpcklbw m3, m4, m10 + punpckhbw m4, m10 + punpcklbw m5, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m6, m11, m13 + punpcklbw m7, m2, m15 + punpckhbw m2, m15 +%if ARCH_X86_64 + SWAP 2, 15 +%else + mova m15, m2 +%endif + + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m7 + punpckhwd m1, m7 + punpcklwd m7, m6, m15 + punpckhwd m6, m15 +%if ARCH_X86_64 + SWAP 6, 15 +%else + mova m15, m6 +%endif + + punpckldq m6, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m5, m7 + punpckhdq m5, m7 + punpckldq m7, m4, m15 + punpckhdq m4, m15 + + ; write 8x16 + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm2 + movhps [dstq+stride3q -4], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm5 + movhps [dstq+stride3q -4], xm5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm7 + movhps [dstq+strideq*1-4], xm7 + movq [dstq+strideq*2-4], xm4 + movhps [dstq+stride3q -4], xm4 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 6, 0 + SWAP 7, 1 + %if ARCH_X86_64 + SWAP 5, 10, 2 + SWAP 8, 11 + SWAP 9, 13 + mova [rsp+21*16], m12 + %else + mova [esp+10*16], m2 + %xdefine m8 m0 + 
%xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + %endif + mova m0, [rsp+11*16] + mova m1, [rsp+12*16] + mova m2, [rsp+13*16] + mova m3, [rsp+14*16] + mova m4, [rsp+19*16] +%if ARCH_X86_64 + mova m7, [rsp+ 1*16] + mova m11, [rsp+20*16] + mova m12, [rsp+15*16] + mova m13, [rsp+16*16] + mova m14, [rsp+17*16] + TRANSPOSE_16X16B 1, [rsp+18*16] +%else + mova m5, [esp+ 2*16] + TRANSPOSE_16X16B 1, [esp+32*16] + mov tmpq, dstq + lea dstq, [dstq+strideq*8] +%endif + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 +%if ARCH_X86_64 + lea dstq, [dstq+strideq*4] +%else + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + mova m8, [esp+11*16] + mova m9, [esp+12*16] + mova m10, [esp+13*16] + mova m11, [esp+14*16] + mova m12, [esp+26*16] + mova m13, [esp+27*16] + mova m14, [esp+ 0*16] + mova m15, [esp+ 1*16] + mov dstq, tmpq +%endif + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + lea dstq, [dstq+strideq*8] +%else + mova m12, [rsp+21*16] +%endif + +%endif ; if %1 == 8 +%endif ; ifidn %2, v +%elif %1 == 6 + ; flat6 filter +%if ARCH_X86_32 + mova [esp+3*16], m3 + mova [esp+4*16], m4 + mova [esp+5*16], m5 + mova [esp+6*16], m6 + %xdefine m8 m3 + %xdefine m10 m4 + %xdefine m11 m5 + %xdefine m15 m6 + %define m3 [esp+3*16] + %define m4 [esp+4*16] + %define m5 [esp+5*16] + %define m6 [esp+6*16] + %define m9 %%flat8mem + 
%define m13 %%p2mem + %define m14 %%q2mem +%endif + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [PIC_sym(pb_3_1)] + pmaddubsw m1, m11, [PIC_sym(pb_3_1)] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, [PIC_sym(pb_2)] + pmaddubsw m15, m10, [PIC_sym(pb_2)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m3 + por m2, m15 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%elif ARCH_X86_32 + mova [esp+11*16], m2 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 +%if ARCH_X86_64 + SWAP 2, 13 +%endif + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m4 + por m2, m15 +%ifidn %2, v + mova [tmpq+stride3q], m2 ; p0 +%elif ARCH_X86_32 + mova [esp+8*16], m2 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 +%if ARCH_X86_64 + SWAP 2, 14 +%endif + pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] + pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m5 + por m2, m15 +%ifidn %2, v + mova [dstq+strideq*0], m2 ; q0 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_2)] + pmaddubsw m11, [PIC_sym(pb_m1_2)] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [PIC_sym(pb_m1_0)] + pmaddubsw m10, [PIC_sym(pb_m1_0)] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [PIC_sym(pw_4096)] + pmulhrsw m1, [PIC_sym(pw_4096)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m6 + por m0, m1 +%if ARCH_X86_32 + %xdefine m3 m8 + %xdefine m4 m10 + %xdefine m5 m11 + %xdefine m6 m15 +%endif +%ifidn %2, v + mova 
[dstq+strideq*1], m0 ; q1 +%else + %if ARCH_X86_64 + SWAP 3, 13 + SWAP 4, 14 + %else + mova m3, [esp+11*16] + mova m4, [esp+ 8*16] + %endif + SWAP 5, 2 + SWAP 6, 0 + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%else ; if %1 == 4 +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%endif +%if ARCH_X86_32 + %define m12 m12reg +%endif +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 32-bit PIC helpers ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 0 ; PIC_reg + %define PIC_reg r2 + %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) + LEA PIC_reg, $$ + %endmacro + + %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base + %if %1 == 0 + mov [esp+PIC_reg_stk_offset], PIC_reg + mov PIC_reg, maskm + %else + mov PIC_reg, [esp+PIC_reg_stk_offset] + %endif + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 1 + %endmacro + %define PIC_sym(sym) (sym) +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < required_stack_alignment + %assign copy_args 1 + %else + %assign copy_args 0 + %endif +%endif + +%macro RELOC_ARGS 1 + %if copy_args + %define maskm [esp+stack_size-gprsize*1] + %define l_stridem [esp+stack_size-gprsize*2] + %define lutm [esp+stack_size-gprsize*3] + %define %1m [esp+stack_size-gprsize*4] + mov r6d, r6m + mov maskm, maskd + mov lutm, lutd + mov %1m, r6d + %else + %define %1m r6m + %endif +%endmacro + +%if ARCH_X86_32 + %define tmpq r4 + %define mstrideq r5 + %define stride3q r6 + %define l_stride3q r6 +%endif + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ + dst, 
stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS w + SETUP_PIC + %define m12 m5 +%endif + shl l_strideq, 2 + sub lq, l_strideq +%if ARCH_X86_64 + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+11*16], m0 + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + +%define maskmem [esp+15*16] +%define mask0 [rsp+11*16] +%define mask1 [rsp+12*16] +%define mask2 [rsp+13*16] +%define minlvl [rsp+14*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, v + +.end: +%if ARCH_X86_32 + mova m12, maskmem + mov mask_bitsd, [esp+25*16] +%endif +.no_filter: + pslld m12, 4 + shl mask_bitsd, 4 + add lq, 16 + add dstq, 16 +%if ARCH_X86_64 + sub wd, 4 +%else + sub dword wm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits +%else +cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS h + SETUP_PIC + %define m12 m5 +%endif + sub lq, 4 + shl l_strideq, 2 +%if ARCH_X86_64 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] +%else + mov l_stridem, l_strided 
+%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+22*16], m0 + mova [rsp+23*16], m1 + mova [rsp+24*16], m2 + mova [rsp+25*16], m3 + +%define maskmem [esp+37*16] +%define mask0 [rsp+22*16] +%define mask1 [rsp+23*16] +%define mask2 [rsp+24*16] +%define minlvl [rsp+25*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +%if ARCH_X86_32 + jmp .end_noload +.end: + mova m12, maskmem + mov l_strideq, l_stridem + mov mask_bitsd, [esp+38*16] +.end_noload: +%else +.end: +%endif + lea lq, [lq+l_strideq*4] + pslld m12, 4 + shl mask_bitsd, 4 +%if ARCH_X86_64 + sub hd, 4 +%else + sub dword hm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS w + SETUP_PIC + %define m12 m4 +%endif + shl l_strideq, 2 + sub lq, l_strideq +%if ARCH_X86_64 + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movq m0, [maskq] 
+ pxor m3, m3 + movd m2, [lutq+136] + pshufb m2, m3 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m0, m1 + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + +%define maskmem [esp+7*16] +%define mask0 [rsp+0*16] +%define mask1 [rsp+1*16] +%define minlvl [rsp+2*16] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+11*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+11*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, v + +.end: +%if ARCH_X86_32 + mova m12, maskmem + mov mask_bitsd, [esp+11*16] +%endif +.no_filter: + pslld m12, 4 + shl mask_bitsd, 4 + add lq, 16 + add dstq, 16 +%if ARCH_X86_64 + sub wd, 4 +%else + sub dword wm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits +%else +cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS h + SETUP_PIC + %define m12 m4 +%endif + sub lq, 4 + shl l_strideq, 2 +%if ARCH_X86_64 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movq m0, [maskq] + pxor m3, m3 + movd m2, [lutq+136] + pshufb m2, m3 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m0, m1 + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + +%define maskmem [esp+7*16] +%define mask0 [rsp+0*16] +%define mask1 [rsp+1*16] +%define minlvl [rsp+2*16] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+12*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + 
XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+12*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +%if ARCH_X86_32 + jmp .end_noload +.end: + mova m12, maskmem + mov l_strided, l_stridem + mov mask_bitsd, [esp+12*16] +.end_noload: +%else +.end: +%endif + lea lq, [lq+l_strideq*4] + pslld m12, 4 + shl mask_bitsd, 4 +%if ARCH_X86_64 + sub hd, 4 +%else + sub dword hm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET diff --git a/third_party/dav1d/src/x86/looprestoration.asm b/third_party/dav1d/src/x86/looprestoration.asm new file mode 100644 index 0000000000..fc6e9f124e --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration.asm @@ -0,0 +1,1158 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_right_ext_mask: times 32 db 0xff + times 32 db 0 +pb_14x0_1_2: times 14 db 0 + db 1, 2 +pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 +pb_15: times 16 db 15 +pw_16: times 2 dw 16 +pw_256: times 2 dw 256 +pw_2048: times 2 dw 2048 +pw_16380: times 2 dw 16380 +pw_0_128: dw 0, 128 +pw_5_6: dw 5, 6 +pd_6: dd 6 +pd_1024: dd 1024 +pd_0xf0080029: dd 0xf0080029 +pd_0xf00801c7: dd 0xf00801c7 + +cextern sgr_x_by_x + +SECTION .text + +INIT_YMM avx2 +cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge + mov edged, edgem + vpbroadcastb m15, [fhq+0] + movifnidn wd, wm + vpbroadcastb m14, [fhq+2] + mov hd, hm + vpbroadcastb m13, [fhq+4] + vpbroadcastw m12, [fhq+6] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m10, [pw_16380] + lea r11, [pb_right_ext_mask] + + DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim + + ; if (edge & has_right) align_w_to_32 + ; else start x at w - 3, and use xlim = -3 as limit in x loop + test edgeb, 2 ; has_right + jnz .align + mov xlimq, -3 + jmp .loop +.align: + add wd, 31 + and wd, ~31 + xor xlimd, xlimd + + ; main y loop for horizontal filter +.loop: + mov srcptrq, srcq + mov dstptrq, dstq + lea xq, [wq+xlimq] + + ; load left edge pixels + test edgeb, 1 ; have_left + jz .emu_left + test leftq, leftq ; left == NULL for 
the edge-extended bottom/top + jz .load_left_combined + movd xm0, [leftq] + add leftq, 4 + pinsrd xm0, [srcq], 1 + pslldq xm0, 9 + jmp .left_load_done +.load_left_combined: + movq xm0, [srcq-3] + pslldq xm0, 10 + jmp .left_load_done +.emu_left: + movd xm0, [srcq] + pshufb xm0, [pb_14x0_1_2] + + ; load right edge pixels +.left_load_done: + cmp xd, 32 + jg .main_load + test xd, xd + jg .load_and_splat + je .splat_right + + ; for very small images (w=[1-2]), edge-extend the original cache, + ; ugly, but only runs in very odd cases + add wd, wd + pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] + shr wd, 1 + + ; main x loop, mostly this starts in .main_load +.splat_right: + ; no need to load new pixels, just extend them from the (possibly previously + ; extended) previous load into m0 + pshufb xm1, xm0, [pb_15] + jmp .main_loop +.load_and_splat: + ; load new pixels and extend edge for right-most + movu m1, [srcptrq+3] + sub r11, xq + movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32] + add r11, xq + vpbroadcastb m3, [srcptrq+2+xq] + pand m1, m2 + pandn m3, m2, m3 + por m1, m3 + jmp .main_loop +.main_load: + ; load subsequent line + movu m1, [srcptrq+3] +.main_loop: + vinserti128 m0, xm1, 1 + + palignr m2, m1, m0, 10 + palignr m3, m1, m0, 11 + palignr m4, m1, m0, 12 + palignr m5, m1, m0, 13 + palignr m6, m1, m0, 14 + palignr m7, m1, m0, 15 + + punpcklbw m0, m2, m1 + punpckhbw m2, m1 + punpcklbw m8, m3, m7 + punpckhbw m3, m7 + punpcklbw m7, m4, m6 + punpckhbw m4, m6 + pxor m9, m9 + punpcklbw m6, m5, m9 + punpckhbw m5, m9 + + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m8, m14 + pmaddubsw m3, m14 + pmaddubsw m7, m13 + pmaddubsw m4, m13 + paddw m0, m8 + paddw m2, m3 + psllw m8, m6, 7 + psllw m3, m5, 7 + psubw m8, m10 + psubw m3, m10 + pmullw m6, m12 + pmullw m5, m12 + paddw m0, m7 + paddw m2, m4 + paddw m0, m6 + paddw m2, m5 + ; for a signed overflow to happen we need filter and pixels as follow: + ; filter => -5,-23,-17,90,-17,-23,-5 + ; pixels => 
255,255,255,0,255,255,255 or 0,0,0,255,0,0,0 + ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6] + ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84] + ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A] + ; => signed 16-bit overflow occurs + paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF] + paddsw m2, m3 + psraw m0, 3 ; shift changes the range to [-1000;+FFF] + psraw m2, 3 + paddw m0, m11 ; adding back 800 (removed in m8) changes the + paddw m2, m11 ; range to [-800;+17FF] as defined in the spec + mova [dstptrq], xm0 ; (note that adding another 800 would give us + mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF]) + vextracti128 [dstptrq+32], m0, 1 + vextracti128 [dstptrq+48], m2, 1 + vextracti128 xm0, m1, 1 + add srcptrq, 32 + add dstptrq, 64 + sub xq, 32 + cmp xd, 32 + jg .main_load + test xd, xd + jg .load_and_splat + cmp xd, xlimd + jg .splat_right + + add srcq, strideq + add dstq, 384*2 + dec hd + jg .loop + RET + +cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge + movifnidn fvq, fvmp + mov edged, edgem + movifnidn hd, hm + vpbroadcastd m10, [fvq] + vpbroadcastd m11, [fvq+4] + vpbroadcastd m0, [pw_0_128] + vpbroadcastd m12, [pd_1024] + + DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr + rorx ylimd, edged, 2 + paddw m11, m0 + and ylimd, 2 ; have_bottom + sub ylimd, 3 + + ; main x loop for vertical filter, does one column of 16 pixels +.loop_x: + mova m3, [midq] ; middle line + + ; load top pixels + test edgeb, 4 ; have_top + jz .emu_top + mova m0, [midq-384*4] + mova m2, [midq-384*2] + mova m1, m0 + jmp .load_bottom_pixels +.emu_top: + mova m0, m3 + mova m1, m3 + mova m2, m3 + + ; load bottom pixels +.load_bottom_pixels: + mov yd, hd + mov mptrq, midq + mov dstptrq, dstq + add yd, ylimd + jg .load_threelines + + ; the remainder here is somewhat messy but only runs in very weird + ; circumstances at the bottom of the image in very small blocks (h=[1-3]), + ; so performance is 
not terribly important here... + je .load_twolines + cmp yd, -1 + je .load_oneline + ; h == 1 case + mova m5, m3 + mova m4, m3 + mova m6, m3 + jmp .loop +.load_oneline: + ; h == 2 case + mova m4, [midq+384*2] + mova m5, m4 + mova m6, m4 + jmp .loop +.load_twolines: + ; h == 3 case + mova m4, [midq+384*2] + mova m5, [midq+384*4] + mova m6, m5 + jmp .loop +.load_threelines: + ; h > 3 case + mova m4, [midq+384*2] + mova m5, [midq+384*4] + ; third line loaded in main loop below + + ; main y loop for vertical filter +.loop_load: + ; load one line into m6. if that pixel is no longer available, do + ; nothing, since m6 still has the data from the previous line in it. We + ; try to structure the loop so that the common case is evaluated fastest + mova m6, [mptrq+384*6] +.loop: + paddw m0, m6 + paddw m7, m1, m5 + paddw m8, m2, m4 + punpcklwd m9, m0, m7 + punpckhwd m0, m7 + punpcklwd m7, m8, m3 + punpckhwd m8, m3 + pmaddwd m9, m10 + pmaddwd m0, m10 + pmaddwd m7, m11 + pmaddwd m8, m11 + add mptrq, 384*2 + paddd m7, m9 + paddd m0, m8 + paddd m7, m12 + paddd m0, m12 + psrad m7, 11 + psrad m0, 11 + packssdw m7, m0 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + mova [dstptrq], xm7 + ; shift pixels one position + mova m0, m1 + mova m1, m2 + mova m2, m3 + mova m3, m4 + mova m4, m5 + mova m5, m6 + add dstptrq, strideq + dec yd + jg .loop_load + ; for the bottom pixels, continue using m6 (as extended edge) + cmp yd, ylimd + jg .loop + add midq, 32 + add dstq, 16 + sub wd, 16 + jg .loop_x + RET + +INIT_YMM avx2 +cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov xlimd, edgem + movifnidn wd, wm + mov hd, hm + mov edged, xlimd + and xlimd, 2 ; have_right + jz .no_right + add wd, 2+15 + and wd, ~15 +.no_right: + lea r10, [pb_right_ext_mask+32] + xor xlimd, 2 ; 2*!have_right + pxor m1, m1 + add srcq, wq + lea sumq, [sumq+wq*2-2] + lea sumsqq, [sumsqq+wq*4-4] + neg wq +.loop_y: + mov xq, wq + + ; load left + test edgeb, 1 ; have_left + jz .no_left + 
test leftq, leftq + jz .load_left_from_main + vpbroadcastw xm0, [leftq+2] + add leftq, 4 + jmp .expand_x +.no_left: + vpbroadcastb xm0, [srcq+xq] + jmp .expand_x +.load_left_from_main: + vpbroadcastw xm0, [srcq+xq-2] +.expand_x: + punpckhbw xm0, xm1 + + ; when we reach this, xm0 contains left two px in highest words + cmp xd, -16 + jle .loop_x +.partial_load_and_extend: + vpbroadcastb m3, [srcq-1] + pmovzxbw m2, [srcq+xq] + movu m4, [r10+xq*2] + punpcklbw m3, m1 + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + psrldq xm2, xm0, 14 + vpbroadcastw m2, xm2 + jmp .loop_x_noload + +.loop_x: + pmovzxbw m2, [srcq+xq] +.loop_x_noload: + vinserti128 m0, xm2, 1 + palignr m3, m2, m0, 12 + palignr m4, m2, m0, 14 + + punpcklwd m5, m3, m2 + punpckhwd m6, m3, m2 + paddw m3, m4 + punpcklwd m0, m4, m1 + punpckhwd m4, m1 + pmaddwd m5, m5 + pmaddwd m6, m6 + pmaddwd m0, m0 + pmaddwd m4, m4 + paddw m3, m2 + paddd m5, m0 + vextracti128 xm0, m2, 1 + paddd m6, m4 + movu [sumq+xq*2], m3 + movu [sumsqq+xq*4+ 0], xm5 + movu [sumsqq+xq*4+16], xm6 + vextracti128 [sumsqq+xq*4+32], m5, 1 + vextracti128 [sumsqq+xq*4+48], m6, 1 + add xq, 16 + + ; if x <= -16 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -16 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + add srcq, strideq + dec hd + jg .loop_y + RET + +INIT_YMM avx2 +cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov xq, -2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom + sub ylimd, 2 ; -2 if have_bottom=0, else 0 +.loop_x: + lea yd, [hq+ylimq+2] + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, 
[sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m6, [sum_ptrq+(384+16)*2*1] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + mova m7, m6 + mova m8, m6 + jmp .loop_y_noload +.load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l2sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l2sq [right] + movu m2, [sumsq_ptrq-(384+16)*4*0] ; l1sq [left] + movu m3, [sumsq_ptrq-(384+16)*4*0+32] ; l1sq [right] + movu m6, [sum_ptrq-(384+16)*2*1] ; l2 + movu m7, [sum_ptrq-(384+16)*2*0] ; l1 +.loop_y: + movu m4, [sumsq_ptrq+(384+16)*4*1] ; l0sq [left] + movu m5, [sumsq_ptrq+(384+16)*4*1+32] ; l0sq [right] + movu m8, [sum_ptrq+(384+16)*2*1] ; l0 +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m6, m7 + paddd m0, m4 + paddd m1, m5 + paddw m6, m8 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+32], m1 + movu [sum_ptrq], m6 + + ; shift position down by one + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + mova m6, m7 + mova m7, m8 + add sumsq_ptrq, (384+16)*4 + add sum_ptrq, (384+16)*2 + dec yd + jg .loop_y + cmp yd, ylimd + jg .loop_y_noload + add xd, 16 + cmp xd, wd + jl .loop_x + RET + +INIT_YMM avx2 +cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 + lea r5, [sgr_x_by_x-0xf03] +%ifidn sd, sm + movd xm6, sd + vpbroadcastd m6, xm6 +%else + vpbroadcastd m6, sm +%endif + vpbroadcastd m8, [pd_0xf00801c7] + vpbroadcastd m9, [pw_256] + pcmpeqb m7, m7 + psrld m10, m9, 13 ; pd_2048 + DEFINE_ARGS a, b, w, h, x + +.loop_y: + mov xq, -2 +.loop_x: + pmovzxwd m0, [bq+xq*2] + pmovzxwd m1, [bq+xq*2+(384+16)*2] + movu m2, [aq+xq*4] + movu m3, [aq+xq*4+(384+16)*4] + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + pmaddwd m0, m8 + pmaddwd m1, m8 + psubd m2, m4 ; p = aa * 9 - bb * bb + psubd m3, m5 + pmulld m2, m6 + pmulld m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + mova m5, m7 + vpgatherdd m4, [r5+m2], m5 
; xx + mova m5, m7 + vpgatherdd m2, [r5+m3], m5 + psrld m4, 24 + psrld m2, 24 + pmulld m0, m4 + pmulld m1, m2 + packssdw m4, m2 + psubw m4, m9, m4 + vpermq m4, m4, q3120 + paddd m0, m10 + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + movu [bq+xq*2], xm4 + vextracti128 [bq+xq*2+(384+16)*2], m4, 1 + movu [aq+xq*4], m0 + movu [aq+xq*4+(384+16)*4], m1 + add xd, 8 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +INIT_YMM avx2 +cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ + tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm + vpbroadcastd m15, [pw_16] + xor xd, xd +.loop_x: + lea tmp_ptrq, [tq+xq*2] + lea src_ptrq, [srcq+xq*1] + lea a_ptrq, [aq+xq*4+(384+16)*4] + lea b_ptrq, [bq+xq*2+(384+16)*2] + movu m0, [aq+xq*4-(384+16)*4-4] + movu m2, [aq+xq*4-(384+16)*4+4] + mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] + paddd m0, m2 ; a:tl+tr [first half] + movu m2, [aq+xq*4-(384+16)*4-4+32] + movu m4, [aq+xq*4-(384+16)*4+4+32] + mova m3, [aq+xq*4-(384+16)*4+32] ; a:top [second half] + paddd m2, m4 ; a:tl+tr [second half] + movu m4, [aq+xq*4-4] + movu m5, [aq+xq*4+4] + paddd m1, [aq+xq*4] ; a:top+ctr [first half] + paddd m4, m5 ; a:l+r [first half] + movu m5, [aq+xq*4+32-4] + movu m6, [aq+xq*4+32+4] + paddd m3, [aq+xq*4+32] ; a:top+ctr [second half] + paddd m5, m6 ; a:l+r [second half] + + movu m6, [bq+xq*2-(384+16)*2-2] + movu m8, [bq+xq*2-(384+16)*2+2] + mova m7, [bq+xq*2-(384+16)*2] ; b:top + paddw m6, m8 ; b:tl+tr + movu m8, [bq+xq*2-2] + movu m9, [bq+xq*2+2] + paddw m7, [bq+xq*2] ; b:top+ctr + paddw m8, m9 ; b:l+r + mov yd, hd +.loop_y: + movu m9, [b_ptrq-2] + movu m10, [b_ptrq+2] + paddw m7, [b_ptrq] ; b:top+ctr+bottom + paddw m9, m10 ; b:bl+br + paddw m10, m7, m8 ; b:top+ctr+bottom+l+r + paddw m6, m9 ; b:tl+tr+bl+br + psubw m7, [b_ptrq-(384+16)*2*2] ; b:ctr+bottom + paddw m10, m6 + psllw m10, 2 + psubw m10, m6 ; aa + pmovzxbw m12, [src_ptrq] + punpcklwd m6, m10, m15 + 
punpckhwd m10, m15 + punpcklwd m13, m12, m15 + punpckhwd m12, m15 + pmaddwd m6, m13 ; aa*src[x]+256 [first half] + pmaddwd m10, m12 ; aa*src[x]+256 [second half] + + movu m11, [a_ptrq-4] + movu m12, [a_ptrq+4] + paddd m1, [a_ptrq] ; a:top+ctr+bottom [first half] + paddd m11, m12 ; a:bl+br [first half] + movu m12, [a_ptrq+32-4] + movu m13, [a_ptrq+32+4] + paddd m3, [a_ptrq+32] ; a:top+ctr+bottom [second half] + paddd m12, m13 ; a:bl+br [second half] + paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] + paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] + paddd m0, m11 ; a:tl+tr+bl+br [first half] + paddd m2, m12 ; a:tl+tr+bl+br [second half] + paddd m13, m0 + paddd m14, m2 + pslld m13, 2 + pslld m14, 2 + psubd m13, m0 ; bb [first half] + psubd m14, m2 ; bb [second half] + vperm2i128 m0, m13, m14, 0x31 + vinserti128 m13, xm14, 1 + psubd m1, [a_ptrq-(384+16)*4*2] ; a:ctr+bottom [first half] + psubd m3, [a_ptrq-(384+16)*4*2+32] ; a:ctr+bottom [second half] + + paddd m6, m13 + paddd m10, m0 + psrad m6, 9 + psrad m10, 9 + packssdw m6, m10 + mova [tmp_ptrq], m6 + + ; shift to next row + mova m0, m4 + mova m2, m5 + mova m4, m11 + mova m5, m12 + mova m6, m8 + mova m8, m9 + + add a_ptrq, (384+16)*4 + add b_ptrq, (384+16)*2 + add tmp_ptrq, 384*2 + add src_ptrq, strideq + dec yd + jg .loop_y + add xd, 16 + cmp xd, wd + jl .loop_x + RET + +INIT_YMM avx2 +cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt +%ifidn wtd, wtm + shl wtd, 4 + movd xm5, wtd + vpbroadcastw m5, xm5 +%else + vpbroadcastw m5, wtm + mov hd, hm + psllw m5, 4 +%endif + DEFINE_ARGS dst, stride, t, w, h, idx +.loop_y: + xor idxd, idxd +.loop_x: + mova m0, [tq+idxq*2+ 0] + mova m1, [tq+idxq*2+32] + pmovzxbw m2, [dstq+idxq+ 0] + pmovzxbw m3, [dstq+idxq+16] + psllw m4, m2, 4 + psubw m0, m4 + psllw m4, m3, 4 + psubw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+idxq], m0 + add idxd, 32 + cmp idxd, wd + jl .loop_x + add 
tq, 384*2 + add dstq, strideq + dec hd + jg .loop_y + RET + +INIT_YMM avx2 +cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + test edgeb, 2 ; have_right + jz .no_right + xor xlimd, xlimd + add wd, 2+15 + and wd, ~15 + jmp .right_done +.no_right: + mov xlimd, 3 + sub wd, 1 +.right_done: + lea r10, [pb_right_ext_mask+32] + pxor m1, m1 + lea srcq, [srcq+wq+1] + lea sumq, [sumq+wq*2-2] + lea sumsqq, [sumsqq+wq*4-4] + neg wq +.loop_y: + mov xq, wq + + ; load left + test edgeb, 1 ; have_left + jz .no_left + test leftq, leftq + jz .load_left_from_main + vpbroadcastd xm2, [leftq] + movd xm0, [srcq+xq-1] + add leftq, 4 + palignr xm0, xm2, 1 + jmp .expand_x +.no_left: + vpbroadcastb xm0, [srcq+xq-1] + jmp .expand_x +.load_left_from_main: + vpbroadcastd xm0, [srcq+xq-4] +.expand_x: + punpckhbw xm0, xm1 + + ; when we reach this, xm0 contains left two px in highest words + cmp xd, -16 + jle .loop_x + test xd, xd + jge .right_extend +.partial_load_and_extend: + vpbroadcastb m3, [srcq-1] + pmovzxbw m2, [srcq+xq] + movu m4, [r10+xq*2] + punpcklbw m3, m1 + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + psrldq xm2, xm0, 14 + vpbroadcastw m2, xm2 + jmp .loop_x_noload + +.loop_x: + pmovzxbw m2, [srcq+xq] +.loop_x_noload: + vinserti128 m0, xm2, 1 + palignr m3, m2, m0, 8 + palignr m4, m2, m0, 10 + palignr m5, m2, m0, 12 + palignr m6, m2, m0, 14 + + paddw m0, m3, m2 + punpcklwd m7, m3, m2 + punpckhwd m3, m2 + paddw m0, m4 + punpcklwd m8, m4, m5 + punpckhwd m4, m5 + paddw m0, m5 + punpcklwd m9, m6, m1 + punpckhwd m5, m6, m1 + paddw m0, m6 + pmaddwd m7, m7 + pmaddwd m3, m3 + pmaddwd m8, m8 + pmaddwd m4, m4 + pmaddwd m9, m9 + pmaddwd m5, m5 + paddd m7, m8 + paddd m3, m4 + paddd m7, m9 + paddd m3, m5 + movu [sumq+xq*2], m0 + movu [sumsqq+xq*4+ 0], xm7 + movu [sumsqq+xq*4+16], xm3 + vextracti128 [sumsqq+xq*4+32], m7, 1 + vextracti128 [sumsqq+xq*4+48], m3, 1 + + vextracti128 
xm0, m2, 1 + add xq, 16 + + ; if x <= -16 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -16 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add srcq, strideq + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + dec hd + jg .loop_y + RET + +INIT_YMM avx2 +cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov xq, -2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom + sub ylimd, 3 ; -3 if have_bottom=0, else -1 +.loop_x: + lea yd, [hq+ylimq+2] + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, [sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m10, [sum_ptrq+(384+16)*2*1] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + mova m6, m0 + mova m7, m1 + mova m11, m10 + mova m12, m10 + mova m13, m10 + jmp .loop_y_second_load +.load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right] + movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] + movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right] + movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 + movu m12, [sum_ptrq-(384+16)*2*0] ; l2 + mova m2, m0 + mova m3, m1 + mova m11, m10 +.loop_y: + movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] + movu m7, [sumsq_ptrq+(384+16)*4*1+32] ; l1sq [right] + movu m13, [sum_ptrq+(384+16)*2*1] ; l1 +.loop_y_second_load: + test yd, yd + jle .emulate_second_load + movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] + movu m9, [sumsq_ptrq+(384+16)*4*2+32] ; l0sq [right] + movu m14, [sum_ptrq+(384+16)*2*2] ; l0 +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m10, m11 + paddd m0, m4 + paddd m1, m5 + paddw m10, m12 + paddd m0, m6 + paddd m1, m7 + paddw m10, m13 + 
paddd m0, m8 + paddd m1, m9 + paddw m10, m14 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+32], m1 + movu [sum_ptrq], m10 + + ; shift position down by one + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 + mova m4, m8 + mova m5, m9 + mova m10, m12 + mova m11, m13 + mova m12, m14 + add sumsq_ptrq, (384+16)*4*2 + add sum_ptrq, (384+16)*2*2 + sub yd, 2 + jge .loop_y + ; l1 = l0 + mova m6, m8 + mova m7, m9 + mova m13, m14 + cmp yd, ylimd + jg .loop_y_noload + add xd, 16 + cmp xd, wd + jl .loop_x + RET +.emulate_second_load: + mova m8, m6 + mova m9, m7 + mova m14, m13 + jmp .loop_y_noload + +INIT_YMM avx2 +cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 + lea r5, [sgr_x_by_x-0xf03] +%ifidn sd, sm + movd xm6, sd + vpbroadcastd m6, xm6 +%else + vpbroadcastd m6, sm +%endif + vpbroadcastd m8, [pd_0xf0080029] + vpbroadcastd m9, [pw_256] + pcmpeqb m7, m7 + psrld m10, m9, 15 ; pd_512 + DEFINE_ARGS a, b, w, h, x +.loop_y: + mov xq, -2 +.loop_x: + pmovzxwd m0, [bq+xq*2+ 0] + pmovzxwd m1, [bq+xq*2+16] + movu m2, [aq+xq*4+ 0] + movu m3, [aq+xq*4+32] + pslld m4, m2, 3 ; aa * 8 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + paddd m4, m4 ; aa * 16 + paddd m5, m5 + paddd m2, m4 ; aa * 25 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p = aa * 25 - bb * bb + psubd m3, m5 + pmulld m2, m6 + pmulld m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + mova m5, m7 + vpgatherdd m4, [r5+m2], m5 ; xx + mova m5, m7 + vpgatherdd m2, [r5+m3], m5 + psrld m4, 24 + psrld m2, 24 + packssdw m3, m4, m2 + pmullw m4, m8 + pmullw m2, m8 + psubw m3, m9, m3 + vpermq m3, m3, q3120 + pmaddwd m0, m4 + pmaddwd m1, m2 + paddd m0, m10 + paddd m1, m10 + psrld m0, 10 + psrld m1, 10 + movu [bq+xq*2], m3 + movu [aq+xq*4+ 0], m0 + movu [aq+xq*4+32], m1 + add xd, 16 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +INIT_YMM avx2 +cglobal 
sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ + tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm + vpbroadcastd m9, [pw_5_6] + vpbroadcastd m12, [pw_256] + psrlw m11, m12, 1 ; pw_128 + psrlw m10, m12, 8 ; pw_1 + xor xd, xd +.loop_x: + lea tmp_ptrq, [tq+xq*2] + lea src_ptrq, [srcq+xq*1] + lea a_ptrq, [aq+xq*4+(384+16)*4] + lea b_ptrq, [bq+xq*2+(384+16)*2] + movu m0, [aq+xq*4-(384+16)*4-4] + mova m1, [aq+xq*4-(384+16)*4] + movu m2, [aq+xq*4-(384+16)*4+4] + movu m3, [aq+xq*4-(384+16)*4-4+32] + mova m4, [aq+xq*4-(384+16)*4+32] + movu m5, [aq+xq*4-(384+16)*4+4+32] + paddd m0, m2 + paddd m3, m5 + paddd m0, m1 + paddd m3, m4 + pslld m2, m0, 2 + pslld m5, m3, 2 + paddd m2, m0 + paddd m5, m3 + paddd m0, m2, m1 ; prev_odd_b [first half] + paddd m1, m5, m4 ; prev_odd_b [second half] + movu m3, [bq+xq*2-(384+16)*2-2] + mova m4, [bq+xq*2-(384+16)*2] + movu m5, [bq+xq*2-(384+16)*2+2] + paddw m3, m5 + punpcklwd m5, m3, m4 + punpckhwd m3, m4 + pmaddwd m5, m9 + pmaddwd m3, m9 + packssdw m2, m5, m3 ; prev_odd_a + mov yd, hd +.loop_y: + movu m3, [a_ptrq-4] + mova m4, [a_ptrq] + movu m5, [a_ptrq+4] + movu m6, [a_ptrq+32-4] + mova m7, [a_ptrq+32] + movu m8, [a_ptrq+32+4] + paddd m3, m5 + paddd m6, m8 + paddd m3, m4 + paddd m6, m7 + pslld m5, m3, 2 + pslld m8, m6, 2 + paddd m5, m3 + paddd m8, m6 + paddd m3, m5, m4 ; cur_odd_b [first half] + paddd m4, m8, m7 ; cur_odd_b [second half] + movu m5, [b_ptrq-2] + mova m6, [b_ptrq] + movu m7, [b_ptrq+2] + paddw m5, m7 + punpcklwd m7, m5, m6 + punpckhwd m5, m6 + pmaddwd m7, m9 + pmaddwd m5, m9 + packssdw m5, m7, m5 ; cur_odd_a + + paddd m0, m3 ; cur_even_b [first half] + paddd m1, m4 ; cur_even_b [second half] + paddw m2, m5 ; cur_even_a + + pmovzxbw m6, [src_ptrq] + vperm2i128 m8, m0, m1, 0x31 + vinserti128 m0, xm1, 1 + punpcklwd m7, m6, m10 + punpckhwd m6, m10 + punpcklwd m1, m2, m12 + punpckhwd m2, m12 + pmaddwd m7, m1 + pmaddwd m6, m2 + paddd m7, m0 + paddd m6, m8 + psrad m7, 9 + psrad m6, 9 + + pmovzxbw 
m8, [src_ptrq+strideq] + punpcklwd m0, m8, m10 + punpckhwd m8, m10 + punpcklwd m1, m5, m11 + punpckhwd m2, m5, m11 + pmaddwd m0, m1 + pmaddwd m8, m2 + vinserti128 m2, m3, xm4, 1 + vperm2i128 m1, m3, m4, 0x31 + paddd m0, m2 + paddd m8, m1 + psrad m0, 8 + psrad m8, 8 + + packssdw m7, m6 + packssdw m0, m8 + mova [tmp_ptrq+384*2*0], m7 + mova [tmp_ptrq+384*2*1], m0 + + mova m0, m3 + mova m1, m4 + mova m2, m5 + add a_ptrq, (384+16)*4*2 + add b_ptrq, (384+16)*2*2 + add tmp_ptrq, 384*2*2 + lea src_ptrq, [src_ptrq+strideq*2] + sub yd, 2 + jg .loop_y + add xd, 16 + cmp xd, wd + jl .loop_x + RET + +INIT_YMM avx2 +cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movifnidn hd, hm + vpbroadcastd m0, wtm + vpbroadcastd m10, [pd_1024] + DEFINE_ARGS dst, stride, t1, t2, w, h, idx +.loop_y: + xor idxd, idxd +.loop_x: + mova m1, [t1q+idxq*2+ 0] + mova m2, [t1q+idxq*2+32] + mova m3, [t2q+idxq*2+ 0] + mova m4, [t2q+idxq*2+32] + pmovzxbw m5, [dstq+idxq+ 0] + pmovzxbw m6, [dstq+idxq+16] + psllw m7, m5, 4 + psllw m8, m6, 4 + psubw m1, m7 + psubw m2, m8 + psubw m3, m7 + psubw m4, m8 + punpcklwd m9, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + pmaddwd m9, m0 + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m2, m0 + paddd m9, m10 + paddd m1, m10 + paddd m3, m10 + paddd m2, m10 + psrad m9, 11 + psrad m1, 11 + psrad m3, 11 + psrad m2, 11 + packssdw m1, m9, m1 + packssdw m2, m3, m2 + paddw m1, m5 + paddw m2, m6 + packuswb m1, m2 + vpermq m1, m1, q3120 + mova [dstq+idxq], m1 + add idxd, 32 + cmp idxd, wd + jl .loop_x + add dstq, strideq + add t1q, 384 * 2 + add t2q, 384 * 2 + dec hd + jg .loop_y + RET +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/looprestoration_init_tmpl.c b/third_party/dav1d/src/x86/looprestoration_init_tmpl.c new file mode 100644 index 0000000000..b0201ce3db --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_init_tmpl.c @@ -0,0 +1,233 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * 
Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#include "common/intops.h" +#include "src/tables.h" + +// Future potential optimizations: +// - special chroma versions which don't filter [0]/[6]; +// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top +// to bottom) instead of scanline-ordered should be faster since then the +// if (have_left) and similar conditions run only once instead of per line; +// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible +// to run 32 (like filter_h_avx2), and then all vpermqs can go; +// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2, +// since then the have_left condition can be inlined; +// - consider having the wrapper (wiener_filter_${ext}) also in hand-written +// assembly, so the setup overhead is minimized. + +#define WIENER_FILTER(ext) \ +/* Declares the external horizontal/vertical Wiener kernels for one ISA suffix and defines the static wiener_filter_<ext> wrapper that chains them. */ \ +void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \ + const pixel *src, ptrdiff_t stride, \ + const int16_t fh[7], const intptr_t w, \ + int h, enum LrEdgeFlags edges); \ +void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \ + const int16_t *mid, int w, int h, \ + const int16_t fv[7], enum LrEdgeFlags edges); \ +\ +static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ + const pixel (*const left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int16_t fh[7], \ + const int16_t fv[7], const enum LrEdgeFlags edges) \ +{ \ + ALIGN_STK_32(int16_t, mid, 68 * 384,); \ +/* mid holds the horizontally filtered rows, 384 int16_t apart: rows 0-1 take the top edge, the block itself starts at row 2, and the bottom edge goes to rows 2+h and 3+h */ \ + /* horizontal filter */ \ + dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \ + fh, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ + dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \ + fh, w, 2, edges); \ + if (edges & LR_HAVE_BOTTOM) \ + dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \ + lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \ + fh, w, 2, edges); \ +/* vertical filter: reads mid starting at the block's first row and is handed dst as its output pointer */ \ + dav1d_wiener_filter_v_##ext(dst, 
dst_stride, &mid[2*384], w, h, fv, edges); \ +} + +#define SGR_FILTER(ext) \ +/* Declares the external self-guided kernels for one ISA suffix and defines the 3x3 (filter1), 5x5 (filter2) and sgr_filter_<ext> wrappers built on them. */ \ +void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ + const pixel (*left)[4], \ + const pixel *src, const ptrdiff_t stride, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ + const int w, const int h, const int strength); \ +void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const int32_t *a, const int16_t *b, \ + const int w, const int h); \ +\ +/* filter with a 3x3 box (radius=1) */ \ +static void dav1d_sgr_filter1_##ext(coef *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const pixel (*left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int strength, \ + const enum LrEdgeFlags edges) \ +{ \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ +/* sumsq/sum point two rows (of 384+16 entries) into their backing arrays so rows -2 and -1 can receive the top-edge sums; a and b are the same storage under the names the later stages use */ \ + dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ + dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ + NULL, lpf, lpf_stride, w, 2, edges); \ +\ + if (edges & LR_HAVE_BOTTOM) \ + dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ + NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ + lpf_stride, w, 2, edges); \ +/* remaining stages: vertical box sums, a/b computation with the given strength, then the final pass that is handed tmp as output */ \ + dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ + dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ + dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ +} \ +\ +void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ + const pixel (*left)[4], \ + const pixel *src, const ptrdiff_t stride, \ + const int w, const int 
h, \ + const enum LrEdgeFlags edges); \ +void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ + const int w, const int h, const int strength); \ +void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const int32_t *a, const int16_t *b, \ + const int w, const int h); \ +\ +/* filter with a 5x5 box (radius=2) */ \ +static void dav1d_sgr_filter2_##ext(coef *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const pixel (*left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int strength, \ + const enum LrEdgeFlags edges) \ +{ \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ +/* sumsq/sum point two rows (of 384+16 entries) into their backing arrays so rows -2 and -1 can receive the top-edge sums; a and b name the same storage */ \ + dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ + dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ + NULL, lpf, lpf_stride, w, 2, edges); \ +\ + if (edges & LR_HAVE_BOTTOM) \ + dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ + NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ + lpf_stride, w, 2, edges); \ +\ + dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ + dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ + dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ +} \ +\ +void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ + const coef *t1, const int w, const int h, \ + const int wt); \ +void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ + const coef *t1, const coef *t2, \ + const int w, const int h, \ + const uint32_t wt); \ +/* Entry point: selects the 3x3-only, 5x5-only or combined path from dav1d_sgr_params[sgr_idx] and passes the filter output plus dst to a weighting kernel */ \ +static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ + const pixel (*const 
left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int sgr_idx, \ + const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ +{ \ + if (!dav1d_sgr_params[sgr_idx][0]) { /* params[0] == 0: only the 3x3 filter runs, weighted by 128 - sgr_wt[1] */ \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ + dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, dav1d_sgr_params[sgr_idx][3], edges); \ + dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ + } else if (!dav1d_sgr_params[sgr_idx][1]) { /* params[1] == 0: only the 5x5 filter runs, weighted by sgr_wt[0] */ \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ + dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, dav1d_sgr_params[sgr_idx][2], edges); \ + dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ + } else { /* both filters run into separate buffers and one weighted2 call combines them */ \ + ALIGN_STK_32(coef, tmp1, 64 * 384,); \ + ALIGN_STK_32(coef, tmp2, 64 * 384,); \ + dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, dav1d_sgr_params[sgr_idx][2], edges); \ + dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, dav1d_sgr_params[sgr_idx][3], edges); \ + const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; /* packed pair: sgr_wt[0] in the low 16 bits, 128 - sgr_wt[0] - sgr_wt[1] in the high 16 bits */ \ + dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ + } \ +} +/* Instantiates both the Wiener and the self-guided wrappers for one ISA suffix. */ +#define DEF_LR_FILTERS(ext) \ +WIENER_FILTER(ext) \ +SGR_FILTER(ext) + +#if BITDEPTH == 8 +WIENER_FILTER(sse2) /* sse2 gets the Wiener wrapper only */ +DEF_LR_FILTERS(ssse3) +# if ARCH_X86_64 +DEF_LR_FILTERS(avx2) +# endif +#endif +/* Stores the x86 wiener/selfguided wrappers into c according to the detected CPU flags; every assignment is compiled only when BITDEPTH == 8 (the avx2 ones also need ARCH_X86_64). */ +COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + /* checks run sse2 -> ssse3 -> avx2 and return at the first missing flag, so the last assignment reached is the one kept */ + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; +#if BITDEPTH == 8 + c->wiener = wiener_filter_sse2; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; +#if BITDEPTH == 8 + c->wiener = wiener_filter_ssse3; + c->selfguided = sgr_filter_ssse3; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; +#if BITDEPTH == 8 && ARCH_X86_64 + c->wiener = wiener_filter_avx2; + c->selfguided = 
sgr_filter_avx2; +#endif +} diff --git a/third_party/dav1d/src/x86/looprestoration_ssse3.asm b/third_party/dav1d/src/x86/looprestoration_ssse3.asm new file mode 100644 index 0000000000..d11f68e736 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_ssse3.asm @@ -0,0 +1,1953 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2018, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +pb_right_ext_mask: times 16 db 0xff + times 16 db 0 +pb_14x0_1_2: times 14 db 0 + db 1, 2 +pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 +pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 +pb_0: times 16 db 0 +pb_2: times 16 db 2 +pb_3: times 16 db 3 +pb_4: times 16 db 4 +pb_15: times 16 db 15 +pb_0_1: times 8 db 0, 1 +pb_6_7: times 8 db 6, 7 +pb_14_15: times 8 db 14, 15 +pw_1: times 8 dw 1 +pw_16: times 8 dw 16 +pw_128: times 8 dw 128 +pw_255: times 8 dw 255 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_16380: times 8 dw 16380 +pw_5_6: times 4 dw 5, 6 +pw_0_128: times 4 dw 0, 128 +pd_1024: times 4 dd 1024 +%if ARCH_X86_32 +pd_256: times 4 dd 256 +pd_512: times 4 dd 512 +pd_2048: times 4 dd 2048 +%endif +pd_0xF0080029: times 4 dd 0xF0080029 +pd_0xF00801C7: times 4 dd 0XF00801C7 + +cextern sgr_x_by_x + +SECTION .text + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg + %assign pic_reg_stk_off 4 + %xdefine PIC_reg %1 + %if %2 == 1 + mov [esp], %1 + %endif + LEA PIC_reg, PIC_base_offset + %if %3 == 1 + XCHG_PIC_REG + %endif + %endmacro + + %macro XCHG_PIC_REG 0 + mov [esp+pic_reg_stk_off], PIC_reg + %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 + mov PIC_reg, [esp+pic_reg_stk_off] + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 0 + %endmacro + + %define PIC_sym(sym) (sym) +%endif + +%macro PALIGNR 4 ; dst, src1, src2, shift + %if cpuflag(ssse3) + palignr %1, %2, %3, %4 + %else + %assign %%i regnumof%+%1 + 1 + %define %%tmp m %+ %%i + psrldq %1, %3, %4 + pslldq %%tmp, %2, 16-%4 + por %1, %%tmp + %endif +%endmacro + +%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero + %if cpuflag(ssse3) + pmaddubsw %1, %2 + %else + %if %5 == 1 + pxor %3, %3 + %endif + 
punpckhbw %4, %1, %3 + punpcklbw %1, %3 + pmaddwd %4, %2 + pmaddwd %1, %2 + packssdw %1, %4 + %endif +%endmacro + +;;;;;;;;;;;;;;;;;;;;;; +;; wiener ;; +;;;;;;;;;;;;;;;;;;;;;; + +%macro WIENER_H 0 +%if ARCH_X86_64 +cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge + mov edged, edgem + movifnidn wd, wm + mov hd, hm +%else +cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge + mov r5, edgem + mov [esp+12], r5 + mov wd, wm + mov hd, hm + SETUP_PIC hd + %define m15 m0 + %define m14 m1 + %define m13 m2 + %define m12 m3 +%endif + + movq m15, [fhq] +%if cpuflag(ssse3) + pshufb m12, m15, [PIC_sym(pb_6_7)] + pshufb m13, m15, [PIC_sym(pb_4)] + pshufb m14, m15, [PIC_sym(pb_2)] + pshufb m15, m15, [PIC_sym(pb_0)] +%else + pshuflw m12, m15, q3333 + punpcklbw m15, m15 + pshufhw m13, m15, q0000 + pshuflw m14, m15, q2222 + pshuflw m15, m15, q0000 + punpcklqdq m12, m12 + punpckhqdq m13, m13 + punpcklqdq m14, m14 + punpcklqdq m15, m15 + psraw m13, 8 + psraw m14, 8 + psraw m15, 8 +%endif + +%if ARCH_X86_64 + mova m11, [pw_2048] + mova m10, [pw_16380] + lea r11, [pb_right_ext_mask] + + DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim +%else + %define m10 [PIC_sym(pw_16380)] + %define m11 [PIC_sym(pw_2048)] + %define m12 [esp+0x14] + %define m13 [esp+0x24] + %define m14 [esp+0x34] + %define m15 [esp+0x44] + mova m12, m3 + mova m13, m2 + mova m14, m1 + mova m15, m0 + + DEFINE_ARGS dst, left, src, stride, x, w, h, edge + %define srcptrq srcq + %define dstptrq dstq + %define hd dword [esp+ 0] + %define edgeb byte [esp+12] + %define xlimd dword [esp+16] +%endif + + ; if (edge & has_right) align_w_to_16 + ; else w -= 3, and use that as limit in x loop + test edgeb, 2 ; has_right + jnz .align + mov xlimd, -3 + jmp .loop +.align: + add wd, 15 + and wd, ~15 +%if ARCH_X86_64 + xor xlimd, xlimd +%else + mov xlimd, 0 +%endif + + ; main y loop for vertical filter +.loop: +%if ARCH_X86_64 + mov srcptrq, srcq + mov dstptrq, 
dstq + lea xd, [wq+xlimq] +%else + mov [esp+8], srcq + mov [esp+4], dstq + mov xd, xlimd + add xd, wd +%endif + + ; load left edge pixels + test edgeb, 1 ; have_left + jz .emu_left + test leftq, leftq ; left == NULL for the edge-extended bottom/top + jz .load_left_combined + movd m0, [leftq] + movd m1, [srcq] + punpckldq m0, m1 + pslldq m0, 9 + add leftq, 4 + jmp .left_load_done +.load_left_combined: + movq m0, [srcq-3] + pslldq m0, 10 + jmp .left_load_done +.emu_left: + movd m0, [srcq] +%if cpuflag(ssse3) + pshufb m0, [PIC_sym(pb_14x0_1_2)] +%else + pslldq m1, m0, 13 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + psrldq m0, 2 + por m0, m1 +%endif + + ; load right edge pixels +.left_load_done: + cmp xd, 16 + jg .main_load + test xd, xd + jg .load_and_splat + je .splat_right + + ; for very small images (w=[1-2]), edge-extend the original cache, + ; ugly, but only runs in very odd cases +%if cpuflag(ssse3) + add wd, wd + %if ARCH_X86_64 + pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] + %else + pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16] + %endif + shr wd, 1 +%else + shl wd, 4 + pcmpeqd m2, m2 + movd m3, wd + psrldq m2, 2 + punpckhbw m1, m0, m0 + pshufhw m1, m1, q1122 + psllq m1, m3 + pand m0, m2 + pandn m2, m1 + por m0, m2 + shr wd, 4 +%endif + + ; main x loop, mostly this starts in .main_load +.splat_right: + ; no need to load new pixels, just extend them from the (possibly previously + ; extended) previous load into m0 +%if cpuflag(ssse3) + pshufb m1, m0, [PIC_sym(pb_15)] +%else + punpckhbw m1, m0, m0 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 +%endif + jmp .main_loop +.load_and_splat: + ; load new pixels and extend edge for right-most + movu m1, [srcptrq+3] +%if ARCH_X86_64 + sub r11, xq + movu m2, [r11+16] + add r11, xq +%else + sub PIC_reg, xd + movu m2, [PIC_sym(pb_right_ext_mask)+16] + add PIC_reg, xd +%endif + movd m3, [srcptrq+2+xq] +%if cpuflag(ssse3) + pshufb m3, [PIC_sym(pb_0)] +%else + punpcklbw m3, m3 + pshuflw m3, 
m3, q0000 + punpcklqdq m3, m3 +%endif + pand m1, m2 + pxor m2, [PIC_sym(pb_right_ext_mask)] + pand m3, m2 + pxor m2, [PIC_sym(pb_right_ext_mask)] + por m1, m3 + jmp .main_loop +.main_load: + ; load subsequent line + movu m1, [srcptrq+3] +.main_loop: +%if ARCH_X86_64 + PALIGNR m2, m1, m0, 10 + PALIGNR m3, m1, m0, 11 + PALIGNR m4, m1, m0, 12 + PALIGNR m5, m1, m0, 13 + PALIGNR m6, m1, m0, 14 + PALIGNR m7, m1, m0, 15 + + punpcklbw m0, m2, m1 + punpckhbw m2, m1 + punpcklbw m8, m3, m7 + punpckhbw m3, m7 + punpcklbw m7, m4, m6 + punpckhbw m4, m6 + PMADDUBSW m0, m15, m6, m9, 1 + PMADDUBSW m2, m15, m6, m9, 0 + PMADDUBSW m8, m14, m6, m9, 0 + PMADDUBSW m3, m14, m6, m9, 0 + PMADDUBSW m7, m13, m6, m9, 0 + PMADDUBSW m4, m13, m6, m9, 0 + paddw m0, m8 + paddw m2, m3 + %if cpuflag(ssse3) + pxor m6, m6 + %endif + punpcklbw m3, m5, m6 + punpckhbw m5, m6 + psllw m8, m3, 7 + psllw m6, m5, 7 + psubw m8, m10 + psubw m6, m10 + pmullw m3, m12 + pmullw m5, m12 + paddw m0, m7 + paddw m2, m4 + paddw m0, m3 + paddw m2, m5 + paddsw m0, m8 ; see the avx2 for an explanation + paddsw m2, m6 ; of how the clipping works here + psraw m0, 3 + psraw m2, 3 + paddw m0, m11 + paddw m2, m11 + mova [dstptrq+ 0], m0 + mova [dstptrq+16], m2 +%else + PALIGNR m2, m1, m0, 10 + punpcklbw m3, m2, m1 + punpckhbw m2, m1 + PMADDUBSW m3, m15, m4, m5, 1 + PMADDUBSW m2, m15, m4, m5, 0 + PALIGNR m4, m1, m0, 11 + PALIGNR m5, m1, m0, 15 + punpcklbw m6, m4, m5 + punpckhbw m4, m5 + PMADDUBSW m6, m14, m5, m7, 1 + PMADDUBSW m4, m14, m5, m7, 0 + paddw m3, m6 + paddw m2, m4 + PALIGNR m4, m1, m0, 12 + PALIGNR m5, m1, m0, 14 + punpcklbw m6, m4, m5 + punpckhbw m4, m5 + PMADDUBSW m6, m13, m5, m7, 1 + PMADDUBSW m4, m13, m5, m7, 0 + paddw m3, m6 + paddw m2, m4 + PALIGNR m6, m1, m0, 13 + %if cpuflag(ssse3) + pxor m5, m5 + %endif + punpcklbw m4, m6, m5 + punpckhbw m6, m5 + psllw m5, m4, 7 + psllw m7, m6, 7 + psubw m5, m10 + psubw m7, m10 + pmullw m4, m12 + pmullw m6, m12 + paddw m3, m4 + paddw m2, m6 + paddsw m3, m5 + paddsw m2, m7 + 
psraw m3, 3 + psraw m2, 3 + paddw m3, m11 + paddw m2, m11 + mova [dstptrq+ 0], m3 + mova [dstptrq+16], m2 +%endif + + mova m0, m1 + add srcptrq, 16 + add dstptrq, 32 + sub xd, 16 + cmp xd, 16 + jg .main_load + test xd, xd + jg .load_and_splat + cmp xd, xlimd + jg .splat_right + +%if ARCH_X86_32 + mov srcq, [esp+8] + mov dstq, [esp+4] +%endif + add srcq, strideq + add dstq, 384*2 + dec hd + jg .loop + RET +%endmacro + +%macro WIENER_V 0 +%if ARCH_X86_64 +cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge + mov edged, edgem + movifnidn fvq, fvmp + movifnidn hd, hm + movq m15, [fvq] + pshufd m14, m15, q1111 + pshufd m15, m15, q0000 + paddw m14, [pw_0_128] + mova m12, [pd_1024] + + DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr + + mov ylimd, edged + and ylimd, 8 ; have_bottom + shr ylimd, 2 + sub ylimd, 3 +%else +cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge + %define ylimd [esp+12] + + mov r5d, edgem + and r5d, 8 + shr r5d, 2 + sub r5d, 3 + mov ylimd, r5d + mov fvq, fvmp + mov edged, edgem + + SETUP_PIC edged + + movq m0, [fvq] + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + paddw m1, [PIC_sym(pw_0_128)] + mova [esp+0x50], m0 + mova [esp+0x40], m1 + + DEFINE_ARGS dst, stride, mid, w, h, y, edge + %define mptrq midq + %define dstptrq dstq + %define edgeb byte [esp] +%endif + + ; main x loop for vertical filter, does one column of 16 pixels +.loop_x: + mova m3, [midq] ; middle line + + ; load top pixels + test edgeb, 4 ; have_top + jz .emu_top + mova m0, [midq-384*4] + mova m2, [midq-384*2] + mova m1, m0 + jmp .load_bottom_pixels +.emu_top: + mova m0, m3 + mova m1, m3 + mova m2, m3 + + ; load bottom pixels +.load_bottom_pixels: + mov yd, hd +%if ARCH_X86_64 + mov mptrq, midq + mov dstptrq, dstq + add yd, ylimd +%else + mov [esp+8], midq + mov [esp+4], dstq + add yd, ylimd +%endif + jg .load_threelines + + ; the remainder here is somewhat messy but only runs in very weird + ; circumstances at the bottom of the 
image in very small blocks (h=[1-3]), + ; so performance is not terribly important here... + je .load_twolines + cmp yd, -1 + je .load_oneline + ; h == 1 case + mova m5, m3 + mova m4, m3 + mova m6, m3 + jmp .loop +.load_oneline: + ; h == 2 case + mova m4, [midq+384*2] + mova m5, m4 + mova m6, m4 + jmp .loop +.load_twolines: + ; h == 3 case + mova m4, [midq+384*2] + mova m5, [midq+384*4] + mova m6, m5 + jmp .loop +.load_threelines: + ; h > 3 case + mova m4, [midq+384*2] + mova m5, [midq+384*4] + ; third line loaded in main loop below + + ; main y loop for vertical filter +.loop_load: + ; load one line into m6. if that pixel is no longer available, do + ; nothing, since m6 still has the data from the previous line in it. We + ; try to structure the loop so that the common case is evaluated fastest + mova m6, [mptrq+384*6] +.loop: +%if ARCH_X86_64 + paddw m7, m0, m6 + paddw m8, m1, m5 + paddw m9, m2, m4 + punpcklwd m10, m7, m8 + punpckhwd m7, m8 + punpcklwd m11, m9, m3 + punpckhwd m9, m3 + pmaddwd m10, m15 + pmaddwd m7, m15 + pmaddwd m11, m14 + pmaddwd m9, m14 + paddd m10, m12 + paddd m7, m12 + paddd m10, m11 + paddd m7, m9 + psrad m10, 11 + psrad m7, 11 + packssdw m10, m7 + packuswb m10, m10 + movq [dstptrq], m10 +%else + mova [esp+0x30], m1 + mova [esp+0x20], m2 + mova [esp+0x10], m3 + paddw m0, m6 + paddw m1, m5 + paddw m2, m4 + punpcklwd m7, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + mova m1, [esp+0x50] + pmaddwd m3, m1 + pmaddwd m0, m1 + mova m1, [esp+0x40] + pmaddwd m7, m1 + pmaddwd m2, m1 + paddd m3, [PIC_sym(pd_1024)] + paddd m0, [PIC_sym(pd_1024)] + paddd m3, m7 + paddd m0, m2 + psrad m3, 11 + psrad m0, 11 + packssdw m3, m0 + packuswb m3, m3 + movq [dstq], m3 + mova m1, [esp+0x30] + mova m2, [esp+0x20] + mova m3, [esp+0x10] +%endif + ; shift pixels one position + mova m0, m1 + mova m1, m2 + mova m2, m3 + mova m3, m4 + mova m4, m5 + mova m5, m6 + add mptrq, 384*2 + add dstptrq, strideq + dec yd + jg .loop_load + ; for the bottom 
pixels, continue using m6 (as extended edge) + cmp yd, ylimd + jg .loop + +%if ARCH_X86_32 + mov midq, [esp+8] + mov dstq, [esp+4] +%endif + add midq, 16 + add dstq, 8 + sub wd, 8 + jg .loop_x + RET +%endmacro + +INIT_XMM sse2 +WIENER_H +WIENER_V + +INIT_XMM ssse3 +WIENER_H +WIENER_V + +;;;;;;;;;;;;;;;;;;;;;;;;;; +;; self-guided ;; +;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro MULLD 2 + pmulhuw m5, %1, %2 + pmullw %1, %2 + pslld m5, 16 + paddd %1, m5 +%endmacro + +%macro GATHERDD 2 + mova m5, m7 + movd r6d, %2 + %if ARCH_X86_64 + movd %1, [r5+r6] + pextrw r6d, %2, 2 + pinsrw m5, [r5+r6+2], 3 + pextrw r6d, %2, 4 + pinsrw %1, [r5+r6+2], 5 + pextrw r6d, %2, 6 + pinsrw m5, [r5+r6+2], 7 + %else + movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] + pextrw r6d, %2, 2 + pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 + pextrw r6d, %2, 4 + pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 + pextrw r6d, %2, 6 + pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 + %endif + por %1, m5 +%endmacro + +%if ARCH_X86_64 +cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + mov xlimd, edgem + movifnidn xd, xm + mov hd, hm + mov edged, xlimd + and xlimd, 2 ; have_right + add xd, xlimd + xor xlimd, 2 ; 2*!have_right +%else +cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + %define wq r0m + %define xlimd r1m + %define hd hmp + %define edgeb byte edgem + + mov r6, edgem + and r6, 2 ; have_right + add xd, r6 + xor r6, 2 ; 2*!have_right + mov xlimd, r6 + SETUP_PIC r6, 0 +%endif + + jnz .no_right + add xd, 7 + and xd, ~7 +.no_right: + pxor m1, m1 + lea srcq, [srcq+xq] + lea sumq, [sumq+xq*2-2] + lea sumsqq, [sumsqq+xq*4-4] + neg xq + mov wq, xq +%if ARCH_X86_64 + lea r10, [pb_right_ext_mask+16] +%endif +.loop_y: + mov xq, wq + + ; load left + test edgeb, 1 ; have_left + jz .no_left + test leftq, leftq + jz .load_left_from_main + movd m0, [leftq] + pslldq m0, 12 + add leftq, 4 + jmp .expand_x +.no_left: + movd m0, [srcq+xq] + pshufb m0, [PIC_sym(pb_0)] + 
jmp .expand_x +.load_left_from_main: + movd m0, [srcq+xq-2] + pslldq m0, 14 +.expand_x: + punpckhbw xm0, xm1 + + ; when we reach this, m0 contains left two px in highest words + cmp xd, -8 + jle .loop_x +.partial_load_and_extend: + movd m3, [srcq-4] + pshufb m3, [PIC_sym(pb_3)] + movq m2, [srcq+xq] + punpcklbw m2, m1 + punpcklbw m3, m1 +%if ARCH_X86_64 + movu m4, [r10+xq*2] +%else + movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] +%endif + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + pshufb m2, m0, [PIC_sym(pb_14_15)] + jmp .loop_x_noload + +.loop_x: + movq m2, [srcq+xq] + punpcklbw m2, m1 +.loop_x_noload: + palignr m3, m2, m0, 12 + palignr m4, m2, m0, 14 + + punpcklwd m5, m3, m2 + punpckhwd m6, m3, m2 + paddw m3, m4 + punpcklwd m7, m4, m1 + punpckhwd m4, m1 + pmaddwd m5, m5 + pmaddwd m6, m6 + pmaddwd m7, m7 + pmaddwd m4, m4 + paddd m5, m7 + paddd m6, m4 + paddw m3, m2 + movu [sumq+xq*2], m3 + movu [sumsqq+xq*4+ 0], m5 + movu [sumsqq+xq*4+16], m6 + + mova m0, m2 + add xq, 8 + + ; if x <= -8 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -8 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + add srcq, strideq + dec hd + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim + movifnidn edged, edgem +%else +cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + %define sumsq_baseq dword [esp+0] + %define sum_baseq dword [esp+4] + %define ylimd dword [esp+8] + %define m8 [esp+12] + mov edged, r4m + mov hd, r3m +%endif + mov xq, -2 +%if ARCH_X86_64 + mov ylimd, edged + and ylimd, 8 ; have_bottom + shr ylimd, 2 + sub ylimd, 2 ; -2 if have_bottom=0, else 0 + mov sumsq_baseq, sumsqq + mov sum_baseq, sumq +.loop_x: + 
mov sumsqq, sumsq_baseq + mov sumq, sum_baseq + lea yd, [hq+ylimq+2] +%else + mov yd, edged + and yd, 8 ; have_bottom + shr yd, 2 + sub yd, 2 ; -2 if have_bottom=0, else 0 + mov sumsq_baseq, sumsqq + mov sum_baseq, sumq + mov ylimd, yd +.loop_x: + mov sumsqd, sumsq_baseq + mov sumd, sum_baseq + lea yd, [hq+2] + add yd, ylimd +%endif + lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] + lea sumq, [sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, [sumsqq+(384+16)*4*1] + movu m1, [sumsqq+(384+16)*4*1+16] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + movu m6, [sumq+(384+16)*2*1] + mova m7, m6 + mova m8, m6 + jmp .loop_y_noload +.load_top: + movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] + movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] + movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] + movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] + movu m6, [sumq-(384+16)*2*1] ; l2 + movu m7, [sumq-(384+16)*2*0] ; l1 +.loop_y: +%if ARCH_X86_64 + movu m8, [sumq+(384+16)*2*1] ; l0 +%else + movu m4, [sumq+(384+16)*2*1] ; l0 + mova m8, m4 +%endif + movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] + movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m6, m7 + paddd m0, m4 + paddd m1, m5 + paddw m6, m8 + movu [sumsqq+ 0], m0 + movu [sumsqq+16], m1 + movu [sumq], m6 + + ; shift position down by one + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + mova m6, m7 + mova m7, m8 + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + dec yd + jg .loop_y + cmp yd, ylimd + jg .loop_y_noload + add xd, 8 + cmp xd, wd + jl .loop_x + RET + +cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +%if ARCH_X86_64 + LEA r5, sgr_x_by_x-0xF03 +%else + SETUP_PIC r5, 0 +%endif + movd m6, sd + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 + pxor m7, m7 + DEFINE_ARGS a, b, w, h, x +%if ARCH_X86_64 + mova m8, [pd_0xF00801C7] + mova m9, [pw_256] + psrld m10, m9, 13 ; 
pd_2048 + mova m11, [pb_unpcklwdw] +%else + %define m8 [PIC_sym(pd_0xF00801C7)] + %define m9 [PIC_sym(pw_256)] + %define m10 [PIC_sym(pd_2048)] + %define m11 [PIC_sym(pb_unpcklwdw)] +%endif +.loop_y: + mov xq, -2 +.loop_x: + movq m0, [bq+xq*2] + movq m1, [bq+xq*2+(384+16)*2] + punpcklwd m0, m7 + punpcklwd m1, m7 + movu m2, [aq+xq*4] + movu m3, [aq+xq*4+(384+16)*4] + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + pmaddwd m0, m8 + pmaddwd m1, m8 + psubd m2, m4 ; p = aa * 9 - bb * bb + psubd m3, m5 + MULLD m2, m6 + MULLD m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + GATHERDD m4, m2 ; xx + GATHERDD m2, m3 + psrld m4, 24 + psrld m2, 24 + packssdw m3, m4, m2 + pshufb m4, m11 + MULLD m0, m4 + pshufb m2, m11 + MULLD m1, m2 + psubw m5, m9, m3 + paddd m0, m10 + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + movq [bq+xq*2], m5 + psrldq m5, 8 + movq [bq+xq*2+(384+16)*2], m5 + movu [aq+xq*4], m0 + movu [aq+xq*4+(384+16)*4], m1 + add xd, 4 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mova m15, [pw_16] + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + xor xd, xd +%else +cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y + %define tmp_baseq [esp+8] + %define src_baseq [esp+12] + %define a_baseq [esp+16] + %define b_baseq [esp+20] + %define wd [esp+24] + %define hd [esp+28] + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + mov wd, xd + mov hd, yd + xor xd, xd + SETUP_PIC yd, 1, 1 + jmp .loop_start +%endif + +.loop_x: + mov tq, tmp_baseq + mov srcq, src_baseq + mov aq, a_baseq + mov bq, b_baseq +%if ARCH_X86_32 +.loop_start: + movu m0, [bq+xq*2-(384+16)*2-2] + movu m2, 
[bq+xq*2-(384+16)*2+2] + mova m1, [bq+xq*2-(384+16)*2] ; b:top + paddw m0, m2 ; b:tl+tr + movu m2, [bq+xq*2-2] + movu m3, [bq+xq*2+2] + paddw m1, [bq+xq*2] ; b:top+ctr + paddw m2, m3 ; b:l+r + mova [esp+0x80], m0 + mova [esp+0x70], m1 + mova [esp+0x60], m2 +%endif + movu m0, [aq+xq*4-(384+16)*4-4] + movu m2, [aq+xq*4-(384+16)*4+4] + mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] + paddd m0, m2 ; a:tl+tr [first half] + movu m2, [aq+xq*4-(384+16)*4-4+16] + movu m4, [aq+xq*4-(384+16)*4+4+16] + mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] + paddd m2, m4 ; a:tl+tr [second half] + movu m4, [aq+xq*4-4] + movu m5, [aq+xq*4+4] + paddd m1, [aq+xq*4] ; a:top+ctr [first half] + paddd m4, m5 ; a:l+r [first half] + movu m5, [aq+xq*4+16-4] + movu m6, [aq+xq*4+16+4] + paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] + paddd m5, m6 ; a:l+r [second half] +%if ARCH_X86_64 + movu m6, [bq+xq*2-(384+16)*2-2] + movu m8, [bq+xq*2-(384+16)*2+2] + mova m7, [bq+xq*2-(384+16)*2] ; b:top + paddw m6, m8 ; b:tl+tr + movu m8, [bq+xq*2-2] + movu m9, [bq+xq*2+2] + paddw m7, [bq+xq*2] ; b:top+ctr + paddw m8, m9 ; b:l+r +%endif + + lea tq, [tq+xq*2] + lea srcq, [srcq+xq*1] + lea aq, [aq+xq*4+(384+16)*4] + lea bq, [bq+xq*2+(384+16)*2] + mov yd, hd +.loop_y: +%if ARCH_X86_64 + movu m9, [bq-2] + movu m10, [bq+2] + paddw m7, [bq] ; b:top+ctr+bottom + paddw m9, m10 ; b:bl+br + paddw m10, m7, m8 ; b:top+ctr+bottom+l+r + paddw m6, m9 ; b:tl+tr+bl+br + psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom + paddw m10, m6 + psllw m10, 2 + psubw m10, m6 ; aa + pxor m14, m14 + movq m12, [srcq] + punpcklbw m12, m14 + punpcklwd m6, m10, m15 + punpckhwd m10, m15 + punpcklwd m13, m12, m15 + punpckhwd m12, m15 + pmaddwd m6, m13 ; aa*src[x]+256 [first half] + pmaddwd m10, m12 ; aa*src[x]+256 [second half] +%else + paddd m1, [aq] ; a:top+ctr+bottom [first half] + paddd m3, [aq+16] ; a:top+ctr+bottom [second half] + mova [esp+0x50], m1 + mova [esp+0x40], m3 + mova [esp+0x30], m4 + movu m6, [aq-4] + movu m7, [aq+4] + 
paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] + paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] + paddd m6, m7 ; a:bl+br [first half] + movu m7, [aq+16-4] + movu m4, [aq+16+4] + paddd m7, m4 ; a:bl+br [second half] + paddd m0, m6 ; a:tl+tr+bl+br [first half] + paddd m2, m7 ; a:tl+tr+bl+br [second half] + paddd m1, m0 + paddd m3, m2 + pslld m1, 2 + pslld m3, 2 + psubd m1, m0 ; bb [first half] + psubd m3, m2 ; bb [second half] +%endif + +%if ARCH_X86_64 + movu m11, [aq-4] + movu m12, [aq+4] + paddd m1, [aq] ; a:top+ctr+bottom [first half] + paddd m11, m12 ; a:bl+br [first half] + movu m12, [aq+16-4] + movu m13, [aq+16+4] + paddd m3, [aq+16] ; a:top+ctr+bottom [second half] + paddd m12, m13 ; a:bl+br [second half] + paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] + paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] + paddd m0, m11 ; a:tl+tr+bl+br [first half] + paddd m2, m12 ; a:tl+tr+bl+br [second half] + paddd m13, m0 + paddd m14, m2 + pslld m13, 2 + pslld m14, 2 + psubd m13, m0 ; bb [first half] + psubd m14, m2 ; bb [second half] + psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] + psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] +%else + mova m4, [esp+0x80] + mova [esp+0x80], m5 + mova m5, [esp+0x70] + mova [esp+0x70], m6 + mova m6, [esp+0x60] + mova [esp+0x60], m7 + mova [esp+0x20], m1 + movu m7, [bq-2] + movu m1, [bq+2] + paddw m5, [bq] ; b:top+ctr+bottom + paddw m7, m1 + paddw m1, m5, m6 ; b:top+ctr+bottom+l+r + paddw m4, m7 ; b:tl+tr+bl+br + psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom + paddw m1, m4 + psllw m1, 2 + psubw m1, m4 ; aa + movq m0, [srcq] + XCHG_PIC_REG + punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16] + punpcklwd m4, m1, [PIC_sym(pw_16)] + punpckhwd m1, [PIC_sym(pw_16)] + punpcklwd m2, m0, [PIC_sym(pw_16)] + punpckhwd m0, [PIC_sym(pw_16)] + XCHG_PIC_REG + pmaddwd m4, m2 ; aa*src[x]+256 [first half] + pmaddwd m1, m0 ; aa*src[x]+256 [second half] +%endif + +%if ARCH_X86_64 + paddd m6, m13 + paddd m10, m14 + psrad m6, 9 
+ psrad m10, 9 + packssdw m6, m10 + mova [tq], m6 +%else + paddd m4, [esp+0x20] + paddd m1, m3 + psrad m4, 9 + psrad m1, 9 + packssdw m4, m1 + mova [tq], m4 +%endif + + ; shift to next row +%if ARCH_X86_64 + mova m0, m4 + mova m2, m5 + mova m4, m11 + mova m5, m12 + mova m6, m8 + mova m8, m9 +%else + mova m1, [esp+0x50] + mova m3, [esp+0x40] + mova m0, [esp+0x30] + mova m2, [esp+0x80] + mova m4, [esp+0x70] + mova [esp+0x70], m5 + mova m5, [esp+0x60] + mova [esp+0x80], m6 + mova [esp+0x60], m7 + psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] + psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] +%endif + + add srcq, strideq + add aq, (384+16)*4 + add bq, (384+16)*2 + add tq, 384*2 + dec yd + jg .loop_y + add xd, 8 + cmp xd, wd + jl .loop_x + RET + +cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt + movifnidn hd, hm +%if ARCH_X86_32 + SETUP_PIC r6, 0 +%endif + movd m0, wtm + pshufb m0, [PIC_sym(pb_0_1)] + psllw m0, 4 + pxor m7, m7 + DEFINE_ARGS dst, stride, t, w, h, idx +.loop_y: + xor idxd, idxd +.loop_x: + mova m1, [tq+idxq*2+ 0] + mova m4, [tq+idxq*2+16] + mova m5, [dstq+idxq] + punpcklbw m2, m5, m7 + punpckhbw m5, m7 + psllw m3, m2, 4 + psllw m6, m5, 4 + psubw m1, m3 + psubw m4, m6 + pmulhrsw m1, m0 + pmulhrsw m4, m0 + paddw m1, m2 + paddw m4, m5 + packuswb m1, m4 + mova [dstq+idxq], m1 + add idxd, 16 + cmp idxd, wd + jl .loop_x + add dstq, strideq + add tq, 384 * 2 + dec hd + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + mova m10, [pb_0] + mova m11, [pb_0_1] +%else +cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + %define edgeb byte edgem + %define wd xd + %define wq wd + %define wm r5m + %define strideq r4m + SUB esp, 8 + SETUP_PIC sumsqd, 1, 1 + + %define m10 [PIC_sym(pb_0)] + %define m11 [PIC_sym(pb_0_1)] +%endif + + test edgeb, 2 ; have_right + jz .no_right + xor xlimd, xlimd + add wd, 2 + 
add wd, 15 + and wd, ~15 + jmp .right_done +.no_right: + mov xlimd, 3 + dec wd +.right_done: + pxor m1, m1 + lea srcq, [srcq+wq+1] + lea sumq, [sumq+wq*2-2] + lea sumsqq, [sumsqq+wq*4-4] + neg wq +%if ARCH_X86_64 + lea r10, [pb_right_ext_mask+16] +%else + mov wm, xd + %define wq wm +%endif + +.loop_y: + mov xq, wq + ; load left + test edgeb, 1 ; have_left + jz .no_left + test leftq, leftq + jz .load_left_from_main + movd m0, [leftq] + movd m2, [srcq+xq-1] + pslldq m2, 4 + por m0, m2 + pslldq m0, 11 + add leftq, 4 + jmp .expand_x +.no_left: + movd m0, [srcq+xq-1] + XCHG_PIC_REG + pshufb m0, m10 + XCHG_PIC_REG + jmp .expand_x +.load_left_from_main: + movd m0, [srcq+xq-4] + pslldq m0, 12 +.expand_x: + punpckhbw m0, m1 + + ; when we reach this, m0 contains left two px in highest words + cmp xd, -8 + jle .loop_x + test xd, xd + jge .right_extend +.partial_load_and_extend: + XCHG_PIC_REG + movd m3, [srcq-1] + movq m2, [srcq+xq] + pshufb m3, m10 + punpcklbw m3, m1 + punpcklbw m2, m1 +%if ARCH_X86_64 + movu m4, [r10+xq*2] +%else + movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] + XCHG_PIC_REG +%endif + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + psrldq m2, m0, 14 + XCHG_PIC_REG + pshufb m2, m11 + XCHG_PIC_REG + jmp .loop_x_noload + +.loop_x: + movq m2, [srcq+xq] + punpcklbw m2, m1 +.loop_x_noload: + palignr m3, m2, m0, 8 + palignr m4, m2, m0, 10 + palignr m5, m2, m0, 12 + palignr m6, m2, m0, 14 + +%if ARCH_X86_64 + paddw m0, m3, m2 + punpcklwd m7, m3, m2 + punpckhwd m3, m2 + paddw m0, m4 + punpcklwd m8, m4, m5 + punpckhwd m4, m5 + paddw m0, m5 + punpcklwd m9, m6, m1 + punpckhwd m5, m6, m1 + paddw m0, m6 + pmaddwd m7, m7 + pmaddwd m3, m3 + pmaddwd m8, m8 + pmaddwd m4, m4 + pmaddwd m9, m9 + pmaddwd m5, m5 + paddd m7, m8 + paddd m3, m4 + paddd m7, m9 + paddd m3, m5 + movu [sumq+xq*2], m0 + movu [sumsqq+xq*4+ 0], m7 + movu [sumsqq+xq*4+16], m3 +%else + paddw m0, m3, m2 + paddw m0, m4 + paddw m0, m5 + paddw m0, m6 + movu [sumq+xq*2], m0 + 
punpcklwd m7, m3, m2 + punpckhwd m3, m2 + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m1 + pmaddwd m7, m7 + pmaddwd m3, m3 + pmaddwd m0, m0 + pmaddwd m4, m4 + pmaddwd m5, m5 + paddd m7, m0 + paddd m3, m4 + paddd m3, m5 + punpcklwd m0, m6, m1 + pmaddwd m0, m0 + paddd m7, m0 + movu [sumsqq+xq*4+ 0], m7 + movu [sumsqq+xq*4+16], m3 +%endif + + mova m0, m2 + add xq, 8 + + ; if x <= -8 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -8 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add srcq, strideq + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + dec hd + jg .loop_y +%if ARCH_X86_32 + ADD esp, 8 +%endif + RET + +%if ARCH_X86_64 +cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov ylimd, edged +%else +cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + %define wm [esp+0] + %define hm [esp+4] + %define edgem [esp+8] + mov wm, xd + mov hm, yd + mov edgem, ylimd +%endif + + and ylimd, 8 ; have_bottom + shr ylimd, 2 + sub ylimd, 3 ; -3 if have_bottom=0, else -1 + mov xq, -2 +%if ARCH_X86_64 +.loop_x: + lea yd, [hd+ylimd+2] + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, [sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+16] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + mova m6, m0 + mova m7, m1 + movu m10, [sum_ptrq+(384+16)*2*1] + mova m11, m10 + mova m12, m10 + mova m13, m10 + jmp .loop_y_second_load +.load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] + movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] + movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] + mova m2, m0 + mova m3, 
m1 + movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 + movu m12, [sum_ptrq-(384+16)*2*0] ; l2 + mova m11, m10 +.loop_y: + movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] + movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] + movu m13, [sum_ptrq+(384+16)*2*1] ; l1 +.loop_y_second_load: + test yd, yd + jle .emulate_second_load + movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] + movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] + movu m14, [sum_ptrq+(384+16)*2*2] ; l0 +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m10, m11 + paddd m0, m4 + paddd m1, m5 + paddw m10, m12 + paddd m0, m6 + paddd m1, m7 + paddw m10, m13 + paddd m0, m8 + paddd m1, m9 + paddw m10, m14 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+16], m1 + movu [sum_ptrq], m10 + + ; shift position down by one + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 + mova m4, m8 + mova m5, m9 + mova m10, m12 + mova m11, m13 + mova m12, m14 + add sumsq_ptrq, (384+16)*4*2 + add sum_ptrq, (384+16)*2*2 + sub yd, 2 + jge .loop_y + ; l1 = l0 + mova m6, m8 + mova m7, m9 + mova m13, m14 + cmp yd, ylimd + jg .loop_y_noload + add xd, 8 + cmp xd, wd + jl .loop_x + RET +.emulate_second_load: + mova m8, m6 + mova m9, m7 + mova m14, m13 + jmp .loop_y_noload +%else +.sumsq_loop_x: + lea yd, [ylimd+2] + add yd, hm + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + test byte edgem, 4 ; have_top + jnz .sumsq_load_top + movu m0, [sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+16] + mova m4, m0 + mova m5, m1 + mova m6, m0 + mova m7, m1 + mova [esp+0x1c], m0 + mova [esp+0x0c], m1 + jmp .sumsq_loop_y_second_load +.sumsq_load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] + movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] + movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] + mova [esp+0x1c], m0 + mova [esp+0x0c], m1 +.sumsq_loop_y: + movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] + movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] 
+.sumsq_loop_y_second_load: + test yd, yd + jle .sumsq_emulate_second_load + movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] + movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] +.sumsq_loop_y_noload: + paddd m0, [esp+0x1c] + paddd m1, [esp+0x0c] + paddd m0, m4 + paddd m1, m5 + paddd m0, m6 + paddd m1, m7 + paddd m0, m2 + paddd m1, m3 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+16], m1 + + ; shift position down by one + mova m0, m4 + mova m1, m5 + mova m4, m2 + mova m5, m3 + mova [esp+0x1c], m6 + mova [esp+0x0c], m7 + add sumsq_ptrq, (384+16)*4*2 + sub yd, 2 + jge .sumsq_loop_y + ; l1 = l0 + mova m6, m2 + mova m7, m3 + cmp yd, ylimd + jg .sumsq_loop_y_noload + add xd, 8 + cmp xd, wm + jl .sumsq_loop_x + + mov xd, -2 +.sum_loop_x: + lea yd, [ylimd+2] + add yd, hm + lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] + test byte edgem, 4 ; have_top + jnz .sum_load_top + movu m0, [sum_ptrq+(384+16)*2*1] + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp .sum_loop_y_second_load +.sum_load_top: + movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 + movu m2, [sum_ptrq-(384+16)*2*0] ; l2 + mova m1, m0 +.sum_loop_y: + movu m3, [sum_ptrq+(384+16)*2*1] ; l1 +.sum_loop_y_second_load: + test yd, yd + jle .sum_emulate_second_load + movu m4, [sum_ptrq+(384+16)*2*2] ; l0 +.sum_loop_y_noload: + paddw m0, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movu [sum_ptrq], m0 + + ; shift position down by one + mova m0, m2 + mova m1, m3 + mova m2, m4 + add sum_ptrq, (384+16)*2*2 + sub yd, 2 + jge .sum_loop_y + ; l1 = l0 + mova m3, m4 + cmp yd, ylimd + jg .sum_loop_y_noload + add xd, 8 + cmp xd, wm + jl .sum_loop_x + RET +.sumsq_emulate_second_load: + mova m2, m6 + mova m3, m7 + jmp .sumsq_loop_y_noload +.sum_emulate_second_load: + mova m4, m3 + jmp .sum_loop_y_noload +%endif + +cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +%if ARCH_X86_64 + LEA r5, sgr_x_by_x-0xF03 +%else + SETUP_PIC r5, 0 +%endif + movd m6, sd + pshuflw m6, m6, 
q0000 + punpcklqdq m6, m6 + pxor m7, m7 + DEFINE_ARGS a, b, w, h, x +%if ARCH_X86_64 + mova m8, [pd_0xF0080029] + mova m9, [pw_256] + psrld m10, m9, 15 ; pd_512 +%else + %define m8 [PIC_sym(pd_0xF0080029)] + %define m9 [PIC_sym(pw_256)] + %define m10 [PIC_sym(pd_512)] +%endif +.loop_y: + mov xq, -2 +.loop_x: + movq m0, [bq+xq*2+0] + movq m1, [bq+xq*2+8] + punpcklwd m0, m7 + punpcklwd m1, m7 + movu m2, [aq+xq*4+ 0] + movu m3, [aq+xq*4+16] + pslld m4, m2, 3 ; aa * 8 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + paddd m4, m4 ; aa * 16 + paddd m5, m5 + paddd m2, m4 ; aa * 25 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p = aa * 25 - bb * bb + psubd m3, m5 + MULLD m2, m6 + MULLD m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + GATHERDD m4, m2 ; xx + GATHERDD m2, m3 + psrld m4, 24 + psrld m2, 24 + packssdw m3, m4, m2 + pmullw m4, m8 + pmullw m2, m8 + psubw m5, m9, m3 + pmaddwd m0, m4 + pmaddwd m1, m2 + paddd m0, m10 + paddd m1, m10 + psrld m0, 10 + psrld m1, 10 + movu [bq+xq*2], m5 + movu [aq+xq*4+ 0], m0 + movu [aq+xq*4+16], m1 + add xd, 8 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + mova m9, [pw_5_6] + mova m12, [pw_256] + psrlw m10, m12, 8 ; pw_1 + psrlw m11, m12, 1 ; pw_128 + pxor m13, m13 +%else +cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y + %define tmp_baseq r0m + %define src_baseq r1m + %define a_baseq r3m + %define b_baseq r4m + %define wd r5m + %define hd r6m + + SUB esp, 8 + SETUP_PIC yd + + %define m8 m5 + %define m9 [PIC_sym(pw_5_6)] + %define m10 [PIC_sym(pw_1)] + %define m11 [PIC_sym(pw_128)] + %define m12 [PIC_sym(pw_256)] + %define m13 m0 +%endif + xor xd, xd +.loop_x: + 
mov tq, tmp_baseq + mov srcq, src_baseq + mov aq, a_baseq + mov bq, b_baseq + movu m0, [aq+xq*4-(384+16)*4-4] + mova m1, [aq+xq*4-(384+16)*4] + movu m2, [aq+xq*4-(384+16)*4+4] + movu m3, [aq+xq*4-(384+16)*4-4+16] + mova m4, [aq+xq*4-(384+16)*4+16] + movu m5, [aq+xq*4-(384+16)*4+4+16] + paddd m0, m2 + paddd m3, m5 + paddd m0, m1 + paddd m3, m4 + pslld m2, m0, 2 + pslld m5, m3, 2 + paddd m2, m0 + paddd m5, m3 + paddd m0, m2, m1 ; prev_odd_b [first half] + paddd m1, m5, m4 ; prev_odd_b [second half] + movu m3, [bq+xq*2-(384+16)*2-2] + mova m4, [bq+xq*2-(384+16)*2] + movu m5, [bq+xq*2-(384+16)*2+2] + paddw m3, m5 + punpcklwd m5, m3, m4 + punpckhwd m3, m4 + pmaddwd m5, m9 + pmaddwd m3, m9 + mova m2, m5 + packssdw m2, m3 ; prev_odd_a + lea tq, [tq+xq*2] + lea srcq, [srcq+xq*1] + lea aq, [aq+xq*4+(384+16)*4] + lea bq, [bq+xq*2+(384+16)*2] +%if ARCH_X86_32 + mov [esp], PIC_reg +%endif + mov yd, hd + XCHG_PIC_REG +.loop_y: + movu m3, [aq-4] + mova m4, [aq] + movu m5, [aq+4] + paddd m3, m5 + paddd m3, m4 + pslld m5, m3, 2 + paddd m5, m3 + paddd m5, m4 ; cur_odd_b [first half] + movu m3, [aq+16-4] + mova m6, [aq+16] + movu m7, [aq+16+4] + paddd m3, m7 + paddd m3, m6 + pslld m7, m3, 2 + paddd m7, m3 + paddd m4, m7, m6 ; cur_odd_b [second half] + movu m3, [bq-2] + mova m6, [bq] + movu m7, [bq+2] + paddw m3, m7 + punpcklwd m7, m3, m6 + punpckhwd m3, m6 + pmaddwd m7, m9 + pmaddwd m3, m9 + packssdw m6, m7, m3 ; cur_odd_a + + paddd m0, m5 ; cur_even_b [first half] + paddd m1, m4 ; cur_even_b [second half] + paddw m2, m6 ; cur_even_a + + movq m3, [srcq] +%if ARCH_X86_64 + punpcklbw m3, m13 +%else + mova [td], m5 + pxor m7, m7 + punpcklbw m3, m7 +%endif + punpcklwd m7, m3, m10 + punpckhwd m3, m10 + punpcklwd m8, m2, m12 + punpckhwd m2, m12 + pmaddwd m7, m8 + pmaddwd m3, m2 + paddd m7, m0 + paddd m3, m1 + psrad m7, 9 + psrad m3, 9 + +%if ARCH_X86_32 + pxor m13, m13 +%endif + movq m8, [srcq+strideq] + punpcklbw m8, m13 + punpcklwd m0, m8, m10 + punpckhwd m8, m10 + punpcklwd m1, m6, m11 
+ punpckhwd m2, m6, m11 + pmaddwd m0, m1 + pmaddwd m8, m2 +%if ARCH_X86_64 + paddd m0, m5 +%else + paddd m0, [td] +%endif + paddd m8, m4 + psrad m0, 8 + psrad m8, 8 + + packssdw m7, m3 + packssdw m0, m8 +%if ARCH_X86_32 + mova m5, [td] +%endif + mova [tq+384*2*0], m7 + mova [tq+384*2*1], m0 + + mova m0, m5 + mova m1, m4 + mova m2, m6 + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + add tq, 384*2*2 + lea srcq, [srcq+strideq*2] +%if ARCH_X86_64 + sub yd, 2 +%else + sub dword [esp+4], 2 +%endif + jg .loop_y + add xd, 8 + cmp xd, wd + jl .loop_x +%if ARCH_X86_32 + ADD esp, 8 +%endif + RET + +cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movd m0, wtm +%if ARCH_X86_64 + movifnidn hd, hm + mova m10, [pd_1024] + pxor m11, m11 +%else + SETUP_PIC hd, 0 + %define m10 [PIC_sym(pd_1024)] + %define m11 m7 +%endif + pshufd m0, m0, 0 + DEFINE_ARGS dst, stride, t1, t2, w, h, idx +%if ARCH_X86_32 + %define hd hmp +%endif + +.loop_y: + xor idxd, idxd +.loop_x: + mova m1, [t1q+idxq*2+ 0] + mova m2, [t1q+idxq*2+16] + mova m3, [t2q+idxq*2+ 0] + mova m4, [t2q+idxq*2+16] + mova m6, [dstq+idxq] +%if ARCH_X86_32 + pxor m11, m11 +%endif + punpcklbw m5, m6, m11 + punpckhbw m6, m11 + psllw m7, m5, 4 + psubw m1, m7 + psubw m3, m7 + psllw m7, m6, 4 + psubw m2, m7 + psubw m4, m7 + punpcklwd m7, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + pmaddwd m7, m0 + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m2, m0 + paddd m7, m10 + paddd m1, m10 + paddd m3, m10 + paddd m2, m10 + psrad m7, 11 + psrad m1, 11 + psrad m3, 11 + psrad m2, 11 + packssdw m7, m1 + packssdw m3, m2 + paddw m7, m5 + paddw m3, m6 + packuswb m7, m3 + mova [dstq+idxq], m7 + add idxd, 16 + cmp idxd, wd + jl .loop_x + add dstq, strideq + add t1q, 384 * 2 + add t2q, 384 * 2 + dec hd + jg .loop_y + RET diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm new file mode 100644 index 0000000000..dda8234f13 --- /dev/null +++ 
b/third_party/dav1d/src/x86/mc_avx2.asm @@ -0,0 +1,5704 @@ +; Copyright © 2018-2020, VideoLAN and dav1d authors +; Copyright © 2018-2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +; dav1d_obmc_masks[] with 64-x interleaved +obmc_masks: db 0, 0, 0, 0 + ; 2 + db 45, 19, 64, 0 + ; 4 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + +warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 + db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 +warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 + db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 +bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +wswap: db 2, 
3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 + db 7, 7, 7, 7, 7, 7, 7, 7 + +wm_420_sign: dd 0x01020102, 0x01010101 +wm_422_sign: dd 0x80808080, 0x7f7f7f7f + +pb_64: times 4 db 64 +pw_m256: times 2 dw -256 +pw_15: times 2 dw 15 +pw_32: times 2 dw 32 +pw_34: times 2 dw 34 +pw_258: times 2 dw 258 +pw_512: times 2 dw 512 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 + +cextern mc_subpel_filters +cextern mc_warp_filter +cextern resize_filter + +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 1-* + %xdefine %1_table (%%table - 2*%2) + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - %2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 
1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - %2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - %2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx2] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6d, [srcq+ssq*0] + mov 
r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +INIT_YMM avx2 +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastw m5, xm5 + test mxyd, mxyd + jnz .hv + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + vpbroadcastd m3, [pw_2048] + add wq, r7 + jmp wq +.h_w2: + movd xm0, [srcq+ssq*0] + pinsrd xm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + mova xm4, 
[bilin_h_shuf4] +.h_w4_loop: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pshufb xm1, xm4 + pmaddubsw xm0, xm5 + pmaddubsw xm1, xm5 + pmulhrsw xm0, xm3 + pmulhrsw xm1, xm3 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movu m1, [srcq+8*4] + movu m2, [srcq+8*5] + add srcq, ssq + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + packuswb m1, m2 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + mov r6, -32*3 +.h_w128_loop: + movu m0, [srcq+r6+32*3+8*0] + movu m1, [srcq+r6+32*3+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+32*3], m0 + add r6, 
32 + jle .h_w128_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 255 + vpbroadcastd m5, [pw_2048] + add mxyd, 16 + add wq, r7 + movd xm4, mxyd + vpbroadcastw m4, xm4 + jmp wq +.v_w2: + movd xm0, [srcq+ssq*0] +.v_w2_loop: + pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xm1, xm1, q2301 ; 1 0 + punpcklbw xm1, xm0 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 1 + pextrw [dstq+dsq*1], xm1, 0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm0, [srcq+ssq*0] +.v_w4_loop: + vpbroadcastd xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm1, xm2, xm0, 0x01 ; 0 1 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm2, xm0, 0x02 ; 1 2 + punpcklbw xm1, xm2 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm0, [srcq+ssq*0] +.v_w8_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw xm1, xm0, xm2 + movq xm0, [srcq+ssq*0] + punpcklbw xm2, xm0 + pmaddubsw xm1, xm4 + pmaddubsw xm2, xm4 + pmulhrsw xm1, xm5 + pmulhrsw xm2, xm5 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu xm0, [srcq+ssq*0] +.v_w16_loop: + vbroadcasti128 m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m3, m0, 0x0f ; 0 1 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m3, m0, 0xf0 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +%macro PUT_BILIN_V_W32 0 + movu m0, [srcq+ssq*0] +%%loop: + movu m3, [srcq+ssq*1] + lea srcq, 
[srcq+ssq*2] + punpcklbw m1, m0, m3 + punpckhbw m2, m0, m3 + movu m0, [srcq+ssq*0] + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +%endmacro + PUT_BILIN_V_W32 + RET +.v_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] +.v_w64_loop: + add srcq, ssq + movu m3, [srcq+32*0] + punpcklbw m2, m0, m3 + punpckhbw m0, m3 + pmaddubsw m2, m4 + pmaddubsw m0, m4 + pmulhrsw m2, m5 + pmulhrsw m0, m5 + packuswb m2, m0 + mova m0, m3 + movu m3, [srcq+32*1] + mova [dstq+32*0], m2 + punpcklbw m2, m1, m3 + punpckhbw m1, m3 + pmaddubsw m2, m4 + pmaddubsw m1, m4 + pmulhrsw m2, m5 + pmulhrsw m1, m5 + packuswb m2, m1 + mova m1, m3 + mova [dstq+32*1], m2 + add dstq, dsq + dec hd + jg .v_w64_loop + RET +.v_w128: + lea r6d, [hq+(3<<8)] + mov r4, srcq + mov r7, dstq +.v_w128_loop: + PUT_BILIN_V_W32 + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_15] + movd xm6, mxyd + add wq, r7 + paddb m5, m5 + vpbroadcastw m6, xm6 + jmp wq +.hv_w2: + vpbroadcastd xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pinsrd xm1, [srcq+ssq*0], 1 + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 _ 2 _ + shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 0 + pextrw [dstq+dsq*1], xm1, 2 + 
lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova xm4, [bilin_h_shuf4] + movddup xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w4_loop: + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 2 + shufps xm2, xm0, xm1, q1032 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 m0, [srcq+ssq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhw m1, m6 + pavgw m2, m7 + paddw m1, m2 + psrlw m1, 4 + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w16_loop: + movu xm2, [srcq+ssq*1+8*0] + vinserti128 m2, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0+8*0] + vinserti128 m3, [srcq+ssq*0+8*1], 1 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + pmulhw m1, m6 + pavgw m0, m7 + paddw m1, m0 + pmaddubsw m0, m3, m5 + psubw m3, m0, m2 + pmulhw m3, m6 + pavgw m2, m7 + paddw m3, m2 + psrlw m1, 4 + psrlw m3, 4 + packuswb m1, m3 + vpermq m1, m1, q3120 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w128: + lea r6d, [hq+(3<<16)] + jmp .hv_w32_start +.hv_w64: + lea r6d, [hq+(1<<16)] +.hv_w32_start: + mov r4, srcq + mov r7, dstq +.hv_w32: +%if WIN64 + movaps r4m, xmm8 +%endif +.hv_w32_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb 
m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w32_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m8, m2, m0 + pmulhw m8, m6 + pavgw m0, m7 + paddw m8, m0 + mova m0, m2 + psubw m2, m3, m1 + pmulhw m2, m6 + pavgw m1, m7 + paddw m2, m1 + mova m1, m3 + psrlw m8, 4 + psrlw m2, 4 + packuswb m8, m2 + mova [dstq], m8 + add dstq, dsq + dec hd + jg .hv_w32_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<16 + jg .hv_w32_loop0 +%if WIN64 + movaps xmm8, r4m +%endif + RET + +cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep%+SUFFIX] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xm0, [srcq+strideq*0] + pinsrd xm0, [srcq+strideq*1], 1 + pinsrd xm0, [srcq+strideq*2], 2 + pinsrd xm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + movq xm1, [srcq+strideq*2] + movhps xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + pmovzxbw m1, xm1 + psllw m0, 4 + psllw m1, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmovzxbw m0, [srcq+strideq*0+16*0] + pmovzxbw m1, 
[srcq+strideq*0+16*1] + pmovzxbw m2, [srcq+strideq*1+16*0] + pmovzxbw m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmovzxbw m0, [srcq+16*0] + pmovzxbw m1, [srcq+16*1] + pmovzxbw m2, [srcq+16*2] + pmovzxbw m3, [srcq+16*3] + add srcq, strideq + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmovzxbw m0, [srcq+16*0] + pmovzxbw m1, [srcq+16*1] + pmovzxbw m2, [srcq+16*2] + pmovzxbw m3, [srcq+16*3] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmovzxbw m0, [srcq+16*4] + pmovzxbw m1, [srcq+16*5] + pmovzxbw m2, [srcq+16*6] + pmovzxbw m3, [srcq+16*7] + add tmpq, 32*8 + add srcq, strideq + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastw m5, xm5 + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + vbroadcasti128 m4, [bilin_h_shuf4] +.h_w4_loop: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + movq xm1, [srcq+strideq*2] + movhps xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, xm1, 1 + pshufb m0, m4 + pmaddubsw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: +.h_w8_loop: + movu xm0, [srcq+strideq*0] + 
vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: +.h_w16_loop: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + movu xm2, [srcq+strideq*2+8*0] + vinserti128 m2, [srcq+strideq*2+8*1], 1 + movu xm3, [srcq+stride3q +8*0] + vinserti128 m3, [srcq+stride3q +8*1], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: +.h_w32_loop: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + movu xm1, [srcq+strideq*0+8*2] + vinserti128 m1, [srcq+strideq*0+8*3], 1 + movu xm2, [srcq+strideq*1+8*0] + vinserti128 m2, [srcq+strideq*1+8*1], 1 + movu xm3, [srcq+strideq*1+8*2] + vinserti128 m3, [srcq+strideq*1+8*3], 1 + lea srcq, [srcq+strideq*2] + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + movu xm2, [srcq+8*4] + vinserti128 m2, [srcq+8*5], 1 + movu xm3, [srcq+8*6] + vinserti128 m3, [srcq+8*7], 1 + add srcq, strideq + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova 
[tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .h_w64 + RET +.h_w128: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + movu xm2, [srcq+8*4] + vinserti128 m2, [srcq+8*5], 1 + movu xm3, [srcq+8*6] + vinserti128 m3, [srcq+8*7], 1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + movu xm0, [srcq+8* 8] + vinserti128 m0, [srcq+8* 9], 1 + movu xm1, [srcq+8*10] + vinserti128 m1, [srcq+8*11], 1 + movu xm2, [srcq+8*12] + vinserti128 m2, [srcq+8*13], 1 + movu xm3, [srcq+8*14] + vinserti128 m3, [srcq+8*15], 1 + add tmpq, 32*8 + add srcq, strideq + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .h_w128 + RET +.v: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 255 + add mxyd, 16 + add wq, r6 + lea stride3q, [strideq*3] + movd xm6, mxyd + vpbroadcastw m6, xm6 + jmp wq +.v_w4: + movd xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastd m1, [srcq+strideq*2] + vpbroadcastd xm2, [srcq+strideq*1] + vpbroadcastd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0x05 ; 0 2 2 2 + vpbroadcastd m0, [srcq+strideq*0] + vpblendd m3, m2, 0x0f ; 1 1 3 3 + vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 + vpblendd m1, m3, 0xaa ; 0 1 2 3 + vpblendd m2, m3, 0x55 ; 1 2 3 4 + punpcklbw m1, m2 + pmaddubsw m1, m6 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm0, [srcq+strideq*0] +.v_w8_loop: + vpbroadcastq m1, [srcq+strideq*2] + vpbroadcastq m2, [srcq+strideq*1] + vpbroadcastq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd 
m1, m0, 0x03 ; 0 2 2 2 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m2, m3, 0xcc ; 1 3 1 3 + vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 + vpblendd m2, m1, 0x0f ; 0 2 1 3 + vpblendd m3, m0, 0xc0 ; 1 3 2 4 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m6 + pmaddubsw m2, m6 + mova [tmpq+32*0], m1 + mova [tmpq+32*1], m2 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti128 m0, [srcq+strideq*0] +.v_w16_loop: + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + shufpd m4, m0, m2, 0x0c ; 0 2 + vbroadcasti128 m0, [srcq+strideq*0] + shufpd m1, m3, 0x0c ; 1 3 + shufpd m2, m0, 0x0c ; 2 4 + punpcklbw m3, m4, m1 + punpcklbw m5, m1, m2 + punpckhbw m4, m1 + punpckhbw m1, m2 + pmaddubsw m3, m6 + pmaddubsw m5, m6 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+32*0], m3 + mova [tmpq+32*1], m5 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m1 + add tmpq, 32*4 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + vpermq m0, [srcq+strideq*0], q3120 +.v_w32_loop: + vpermq m1, [srcq+strideq*1], q3120 + vpermq m2, [srcq+strideq*2], q3120 + vpermq m3, [srcq+stride3q ], q3120 + lea srcq, [srcq+strideq*4] + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + vpermq m0, [srcq+strideq*0], q3120 + pmaddubsw m4, m6 + pmaddubsw m5, m6 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m5 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + punpcklbw m5, m2, m3 + punpckhbw m2, m3 + pmaddubsw m5, m6 + pmaddubsw m2, m6 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m1 + add tmpq, 32*8 + punpcklbw m1, m3, m0 + punpckhbw m3, m0 + pmaddubsw m1, m6 + pmaddubsw m3, m6 + mova [tmpq-32*4], m5 + mova [tmpq-32*3], m2 + mova [tmpq-32*2], m1 + mova [tmpq-32*1], m3 + sub hd, 4 + jg .v_w32_loop + RET +.v_w64: + vpermq m0, [srcq+strideq*0+32*0], q3120 + vpermq m1, [srcq+strideq*0+32*1], q3120 +.v_w64_loop: + vpermq m2, [srcq+strideq*1+32*0], q3120 + vpermq m3, [srcq+strideq*1+32*1], q3120 + 
lea srcq, [srcq+strideq*2] + punpcklbw m4, m0, m2 + punpckhbw m0, m2 + pmaddubsw m4, m6 + pmaddubsw m0, m6 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m0 + punpcklbw m4, m1, m3 + punpckhbw m5, m1, m3 + vpermq m0, [srcq+strideq*0+32*0], q3120 + vpermq m1, [srcq+strideq*0+32*1], q3120 + pmaddubsw m4, m6 + pmaddubsw m5, m6 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m5 + add tmpq, 32*8 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 + punpcklbw m5, m3, m1 + punpckhbw m3, m1 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + pmaddubsw m5, m6 + pmaddubsw m3, m6 + mova [tmpq-32*4], m4 + mova [tmpq-32*3], m2 + mova [tmpq-32*2], m5 + mova [tmpq-32*1], m3 + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + lea r6d, [hq+(3<<8)] + mov r3, srcq + mov r5, tmpq +.v_w128_loop0: + vpermq m0, [srcq+strideq*0], q3120 +.v_w128_loop: + vpermq m1, [srcq+strideq*1], q3120 + lea srcq, [srcq+strideq*2] + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vpermq m0, [srcq+strideq*0], q3120 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + punpcklbw m4, m1, m0 + punpckhbw m1, m0 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+32*0], m2 + mova [tmpq+32*1], m3 + mova [tmpq+32*8], m4 + mova [tmpq+32*9], m1 + add tmpq, 32*16 + sub hd, 2 + jg .v_w128_loop + add r3, 32 + add r5, 64 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w128_loop0 + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + vpbroadcastw m6, xm6 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + vbroadcasti128 m4, [bilin_h_shuf4] + vpbroadcastq m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w4_loop: + movq xm1, [srcq+strideq*1] + movhps xm1, [srcq+strideq*2] + movq xm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movhps xm2, [srcq+strideq*0] + vinserti128 m1, 
xm2, 1 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 3 4 + vpblendd m2, m1, m0, 0xc0 + vpermq m2, m2, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 m2, [srcq+strideq*0], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 ; 1 2 + vperm2i128 m3, m0, m1, 0x21 ; 0 1 + pmaddubsw m0, m2, m5 ; 3 4 + vperm2i128 m2, m1, m0, 0x21 ; 2 3 + psubw m1, m3 + pmulhrsw m1, m6 + paddw m1, m3 + psubw m3, m0, m2 + pmulhrsw m3, m6 + paddw m3, m2 + mova [tmpq+32*0], m1 + mova [tmpq+32*1], m3 + add tmpq, 32*2 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w16_loop: + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + movu xm2, [srcq+strideq*0+8*0] + vinserti128 m2, [srcq+strideq*0+8*1], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+32*0], m3 + mova [tmpq+32*1], m2 + add tmpq, 32*2 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w32_loop: + add srcq, strideq + movu xm2, [srcq+8*0] + vinserti128 m2, [srcq+8*1], 1 + pshufb m2, m4 + pmaddubsw m2, m5 + psubw m3, m2, m0 + pmulhrsw m3, m6 + paddw m3, m0 + mova m0, m2 + movu xm2, [srcq+8*2] + vinserti128 m2, [srcq+8*3], 1 + pshufb m2, m4 + pmaddubsw m2, m5 + mova [tmpq+32*0], m3 + psubw m3, m2, m1 + pmulhrsw m3, m6 + paddw m3, m1 + 
mova m1, m2 + mova [tmpq+32*1], m3 + add tmpq, 32*2 + dec hd + jg .hv_w32_loop + RET +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r6d, 256 + jmp .hv_w64_start +.hv_w64: + lea r3d, [hq+(3<<8)] + mov r6d, 128 +.hv_w64_start: +%if WIN64 + PUSH r7 +%endif + mov r5, srcq + mov r7, tmpq +.hv_w64_loop0: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w64_loop: + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + movu xm2, [srcq+strideq*0+8*0] + vinserti128 m2, [srcq+strideq*0+8*1], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r6*0], m3 + mova [tmpq+r6*1], m2 + lea tmpq, [tmpq+r6*2] + sub hd, 2 + jg .hv_w64_loop + add r5, 16 + add r7, 32 + movzx hd, r3b + mov srcq, r5 + mov tmpq, r7 + sub r3d, 1<<8 + jg .hv_w64_loop0 +%if WIN64 + POP r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, + +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul 
mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 + lea r6, [ssq*3] + lea r7, [dsq*3] +%if WIN64 + pop r8 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + WIN64_SPILL_XMM 11 + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xm4, [subpel_h_shuf4] + vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] +.h_w2_loop: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm3 + phaddw xm0, xm0 + paddw xm0, xm5 + psraw xm0, 6 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] +.h_w4_loop: + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm6 + pshufb xm1, xm6 + pmaddubsw xm0, xm3 + pmaddubsw xm1, xm3 + phaddw xm0, xm1 + paddw xm0, xm5 + psraw xm0, 6 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + pshufb m%2, m%1, m7 + pshufb m%3, m%1, m8 + pshufb m%1, m6 + pmaddubsw m%4, m%2, m9 + pmaddubsw m%2, m10 + pmaddubsw m%3, m10 + pmaddubsw m%1, m9 + paddw m%3, m%4 + paddw m%1, m%2 + phaddw m%1, m%3 + paddw m%1, m5 + 
psraw m%1, 6 +%endmacro + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + PUT_8TAP_H 0, 2, 3, 4 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + sub dstq, r6 + mov r4, r6 +.h_loop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 32 + jle .h_loop + add srcq, ssq + add dstq, dsq + mov r6, r4 + dec hd + jg .h_loop + RET +.v: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + lea myq, [r8+myq*8+subpel_filters-put_avx2] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + add r6, r8 + lea ss3q, [ssq*3] + sub srcq, ss3q + jmp r6 +.v_w2: + movd xm2, [srcq+ssq*0] + pinsrw xm2, [srcq+ssq*1], 2 + pinsrw xm2, [srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklbw xm3, xm1 ; 45 56 + punpcklbw xm1, xm2, xm4 ; 01 12 + punpckhbw xm2, xm4 ; 23 34 +.v_w2_loop: + pmaddubsw xm5, xm1, xm8 ; a0 b0 + mova 
xm1, xm2 + pmaddubsw xm2, xm9 ; a1 b1 + paddw xm5, xm2 + mova xm2, xm3 + pmaddubsw xm3, xm10 ; a2 b2 + paddw xm5, xm3 + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklbw xm3, xm4 ; 67 78 + pmaddubsw xm4, xm3, xm11 ; a3 b3 + paddw xm5, xm4 + pmulhrsw xm5, xm7 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm2, [srcq+ssq*0] + pinsrd xm2, [srcq+ssq*1], 1 + pinsrd xm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklbw xm3, xm1 ; 45 56 + punpcklbw xm1, xm2, xm4 ; 01 12 + punpckhbw xm2, xm4 ; 23 34 +.v_w4_loop: + pmaddubsw xm5, xm1, xm8 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm9 ; a1 b1 + paddw xm5, xm2 + mova xm2, xm3 + pmaddubsw xm3, xm10 ; a2 b2 + paddw xm5, xm3 + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklbw xm3, xm4 ; 67 78 + pmaddubsw xm4, xm3, xm11 ; a3 b3 + paddw xm5, xm4 + pmulhrsw xm5, xm7 + packuswb xm5, xm5 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m6, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m4, 0x30 + vpblendd m4, m2, 0x30 + punpcklbw m1, m4 ; 01 12 + vpblendd m2, m5, 0x30 + vpblendd m5, m3, 0x30 + punpcklbw m2, m5 ; 23 34 + vpblendd m3, m6, 0x30 + vpblendd m6, m0, 0x30 + 
punpcklbw m3, m6 ; 45 56 +.v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m9 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, m10 ; a2 b2 + paddw m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m4, m0, 0x30 + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, m11 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + vextracti128 xm4, m5, 1 + packuswb xm5, xm4 + movq [dstq+dsq*0], xm5 + movhps [dstq+dsq*1], xm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-128] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*2] +.v_w16_loop0: + vbroadcasti128 m4, [srcq+ssq*0] + vbroadcasti128 m5, [srcq+ssq*1] + vbroadcasti128 m6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + vbroadcasti128 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m3, [srcq+ssq*0] + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklbw m1, m4, m5 ; 01 + punpckhbw m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklbw m2, m5, m6 ; 12 + punpckhbw m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklbw m3, m6, m0 ; 23 + punpckhbw m6, m0 ; 56 +.v_w16_loop: + vbroadcasti128 m12, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti128 m13, [srcq+ssq*0] + pmaddubsw m14, m1, m8 ; a0 + pmaddubsw m15, m2, m8 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, m9 ; a1 + pmaddubsw m4, m9 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, m10 ; a2 + pmaddubsw m6, m10 ; b2 + paddw m14, m5 + paddw m15, m6 + shufpd m6, m0, m12, 0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m6, m0 ; 67 + punpckhbw m6, m0 ; 78 + pmaddubsw m12, m5, m11 ; a3 + pmaddubsw m13, m6, m11 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + packuswb m14, m15 + vpermq m14, m14, q3120 + mova [dstq+dsq*0], xm14 + vextracti128 [dstq+dsq*1], m14, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + 
add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] + lea ss3q, [ssq*3] + sub srcq, ss3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m8, [pw_8192] + vpbroadcastd m9, [pd_512] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m6, [subpel_h_shuf4] + movq xm2, [srcq+ssq*0] + movhps xm2, [srcq+ssq*1] + movq xm0, [srcq+ssq*2] + add srcq, ss3q + movhps xm0, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m2, m3, 0x30 + vpblendd m0, m1, 0x30 + vpblendd m2, m4, 0xc0 + pshufb m2, m6 + pshufb m0, m6 + pmaddubsw m2, m7 + pmaddubsw m0, m7 + phaddw m2, m0 + pmulhrsw m2, m8 + vextracti128 xm3, m2, 1 + palignr xm4, xm3, xm2, 4 + punpcklwd xm1, xm2, xm4 ; 01 12 + punpckhwd xm2, xm4 ; 23 34 + pshufd xm0, xm3, q2121 + punpcklwd xm3, xm0 ; 45 56 +.hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm6 + pmaddubsw xm4, xm7 + pmaddwd xm5, xm1, xm10 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm11 ; a1 b1 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm12 ; a2 b2 + phaddw xm4, xm4 + pmulhrsw xm4, xm8 + paddd xm5, xm3 + palignr xm3, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm3, xm0 ; 67 78 + pmaddwd xm4, xm3, xm13 ; a3 b3 + paddd xm5, xm9 + paddd xm5, xm4 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m6, [subpel_h_shuf4] + vpbroadcastq m2, 
[srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpblendd m2, m4, 0xcc ; 0 1 + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m0, m5, 0xcc ; 2 3 + vpblendd m3, m4, 0xcc ; 4 5 + pshufb m2, m6 + pshufb m0, m6 + pshufb m3, m6 + pshufb m1, m6 + pmaddubsw m2, m7 + pmaddubsw m0, m7 + pmaddubsw m3, m7 + pmaddubsw m1, m7 + phaddw m2, m0 + phaddw m3, m1 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + palignr m4, m3, m2, 4 + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + pshufd m0, m3, q2121 + punpcklwd m3, m0 ; 45 56 +.hv_w4_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m1, m10 ; a0 b0 + mova m1, m2 + pmaddwd m2, m11 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m12 ; a2 b2 + paddd m5, m3 + vpbroadcastq m3, [srcq+ssq*0] + vpblendd m4, m3, 0xcc ; 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 + phaddw m4, m4 + pmulhrsw m4, m8 + palignr m3, m4, m0, 12 + mova m0, m4 + punpcklwd m3, m0 ; 67 78 + pmaddwd m4, m3, m13 ; a3 b3 + paddd m5, m9 + paddd m5, m4 + psrad m5, 10 + vextracti128 xm4, m5, 1 + packssdw xm5, xm4 + packuswb xm5, xm5 + pshuflw xm5, xm5, q3120 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] + lea ss3q, [ssq*3] + sub srcq, ss3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] +.hv_w8_loop0: + vbroadcasti128 m7, [subpel_h_shufA] + movu xm4, [srcq+ssq*0] + vbroadcasti128 m8, [subpel_h_shufB] + movu 
xm5, [srcq+ssq*1] + vbroadcasti128 m9, [subpel_h_shufC] + movu xm6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 + vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 + add srcq, ss3q + vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 +%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + pshufb %3, %1, %6 + pshufb %4, %1, %7 + pshufb %1, %5 + pmaddubsw %2, %3, m10 + pmaddubsw %4, m11 + pmaddubsw %3, m11 + pmaddubsw %1, m10 + paddw %2, %4 + paddw %1, %3 + phaddw %1, %2 +%endmacro + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + vpbroadcastd m7, [pw_8192] + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + pmulhrsw m0, m7 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + vpermq m7, m0, q3120 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vextracti128 r6m, m0, 1 ; not enough registers + movu xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 + pmaddwd m8, m1, m12 ; a0 + pmaddwd m9, m2, m12 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m13 ; a1 + pmaddwd m4, m13 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m14 ; a2 + pmaddwd m6, m14 ; b2 + paddd m8, m5 + paddd m9, m6 + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + vbroadcasti128 m5, [subpel_h_shufA] + HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pd_512] + vbroadcasti128 m6, r6m + pmulhrsw m0, m5 + paddd m8, m7 + paddd m9, m7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, m15 ; a3 + paddd m8, m7 + pmaddwd m7, m6, m15 ; b3 + paddd m7, m9 + psrad m8, 10 + psrad m7, 10 + packssdw m8, m7 + vextracti128 
xm7, m8, 1 + packuswb xm8, xm7 + pshufd xm7, xm8, q3120 + movq [dstq+dsq*0], xm7 + movhps [dstq+dsq*1], xm7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add r4, 8 + add r7, 8 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +; PREP_8TAP_H: 8-tap pass over the bytes in m0. Shuffles m0 with the masks in +; m5/m6/m7, multiplies by the coefficient pairs in m8/m9, sums the partial +; products and scales the words with pmulhrsw by m4. Result in m0; clobbers m1-m3. +%macro PREP_8TAP_H 0 + pshufb m1, m0, m5 + pshufb m2, m0, m6 + pshufb m3, m0, m7 + pmaddubsw m1, m8 + pmaddubsw m0, m2, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m0, m3 + phaddw m0, m1, m0 + pmulhrsw m0, m4 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, + +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +; prep_8tap: 8-tap subpel filter that stores word-sized intermediates to tmp. +; Dispatches on mx/my to .h (horizontal only), .v (vertical only), .hv (both), +; or, when neither is set, to the width-indexed jump table of plain prep. +cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep%+SUFFIX] + movsxd wq, wm + movifnidn hd, hm + ; bits 8-11 are the low nibble of the middle byte (mx resp. my in the layout noted above) + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + ; no filtering in either direction: jump into the plain prep code selected by log2(w) + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + vbroadcasti128 m5, [subpel_h_shufA] + WIN64_SPILL_XMM 10 + cmp wd, 4 + je .h_w4 + ; w > 4: use the upper 16 bits of mxd as filter index and dispatch through the _8tap_h table + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + add wq, r7 + jmp wq +.h_w4: + ; w == 4: the low byte of mxd is the filter index; only bytes 2-5 of that filter are loaded + movzx mxd, mxb + dec srcq + vpbroadcastd m6,
[r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + lea stride3q, [strideq*3] +.h_w4_loop: + ; 4 rows per iteration: rows 0 and 2 share m0, rows 1 and 3 share m1 (one row per 128-bit lane) + movq xm0, [srcq+strideq*0] + vpbroadcastq m2, [srcq+strideq*2] + movq xm1, [srcq+strideq*1] + vpblendd m0, m2, 0xf0 + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m2, 0xf0 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m6 + pmaddubsw m1, m6 + phaddw m0, m1 + pmulhrsw m0, m4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + ; 2 rows per iteration, one row in each 128-bit lane of m0 + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + ; 2 rows per iteration; each row is filtered as two 8-pixel halves loaded 8 bytes apart + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+strideq*1+8*0] + vinserti128 m0, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: + ; r6 starts at 0, -32 or -96 and is counted up to 0 in steps of 32 source bytes per .h_loop pass + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + mov r5, r6 +.h_loop: + movu xm0, [srcq+r6+8*0] + vinserti128 m0, [srcq+r6+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+r6+8*2] + vinserti128 m0, [srcq+r6+8*3], 1 + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + add r6, 32 + jle .h_loop + ; end of row: step src to the next row and reload the starting column offset saved in r5 + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET +.v: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. + ; TODO: Would a 6-tap code path be worth it? 
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + ; m7 = pw_8192 multiplier for pmulhrsw; m8-m11 = the four byte pairs of the 8-byte filter at myq + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + cmp wd, 8 + jg .v_w16 + je .v_w8 +.v_w4: + movd xm0, [srcq+strideq*0] + vpbroadcastd m1, [srcq+strideq*2] + vpbroadcastd xm2, [srcq+strideq*1] + add srcq, stride3q + vpbroadcastd m3, [srcq+strideq*0] + vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd m0, [srcq+strideq*1] + vpbroadcastd m2, [srcq+strideq*2] + vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd m0, [srcq+stride3q ] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw m1, m2, m3 ; 01 12 23 34 + vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw m2, m3 ; 23 34 45 56 +.v_w4_loop: + ; 4 output rows (32 bytes of tmp) per iteration; m1/m2 carry the interleaved row pairs between iterations + lea srcq, [srcq+strideq*4] + pinsrd xm0, [srcq+strideq*0], 1 + vpbroadcastd m3, [srcq+strideq*1] + vpbroadcastd m4, [srcq+strideq*2] + vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ + vpbroadcastd m0, [srcq+stride3q ] + vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ + vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb m3, m5 ; 67 78 89 9a + pmaddubsw m4, m1, m8 + vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 + pmaddubsw m2, m9 + paddw m4, m2 + mova m2, m3 + pmaddubsw m3, m11 + paddw m3, m4 + pmaddubsw m4, m1, m10 + paddw m3, m4 + pmulhrsw m3, m7 + mova [tmpq], m3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + ; preload 7 rows and interleave neighbouring rows bytewise into m1/m2/m3 + movq xm1, [srcq+strideq*0] + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m6, [srcq+strideq*1] + vpbroadcastq m0, [srcq+strideq*2] + vpblendd m1, m4, 0x30 + vpblendd m4, m2, 0x30 + punpcklbw m1, m4 ; 01 12 + vpblendd m2, m5, 0x30 + vpblendd m5, m3, 0x30 + punpcklbw m2, m5 ; 23 34 + 
vpblendd m3, m6, 0x30 + vpblendd m6, m0, 0x30 + punpcklbw m3, m6 ; 45 56 +.v_w8_loop: + ; 4 output rows per iteration: m5 accumulates taps a0-a3, m6 accumulates taps b0-b3 + vpbroadcastq m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmaddubsw m5, m2, m9 ; a1 + pmaddubsw m6, m2, m8 ; b0 + vpblendd m2, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 67 78 + pmaddubsw m1, m8 ; a0 + pmaddubsw m4, m3, m9 ; b1 + paddw m5, m1 + mova m1, m3 + pmaddubsw m3, m10 ; a2 + paddw m6, m4 + paddw m5, m3 + vpbroadcastq m4, [srcq+strideq*1] + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*2] + vpblendd m4, m0, 0x30 + punpcklbw m3, m4 ; 89 9a + pmaddubsw m4, m2, m11 ; a3 + paddw m5, m4 + pmaddubsw m4, m2, m10 ; b2 + paddw m6, m4 + pmaddubsw m4, m3, m11 ; b3 + paddw m6, m4 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + mova [tmpq+32*0], m5 + mova [tmpq+32*1], m6 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + ; w > 8 is handled in 16-pixel-wide columns. wd is doubled first, so afterwards + ; r6d = h + 256*(w/16 - 1): row count in the low byte, remaining columns above it. + ; r5/r7 keep the src/tmp start of the current column. + add wd, wd + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+wq*8-256] +.v_w16_loop0: + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m0, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*0] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m1, [srcq+strideq*0] + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+strideq*0] + shufpd m4, m4, m0, 0x0c + shufpd m5, m5, m1, 0x0c + punpcklbw m1, m4, m5 ; 01 + punpckhbw m4, m5 ; 34 + shufpd m6, m6, m2, 0x0c + punpcklbw m2, m5, m6 ; 12 + punpckhbw m5, m6 ; 45 + shufpd m0, m0, m3, 0x0c + punpcklbw m3, m6, m0 ; 23 + punpckhbw m6, m0 ; 56 +.v_w16_loop: + ; 2 output rows per iteration: m14 accumulates a0-a3, m15 accumulates b0-b3 + vbroadcasti128 m12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m13, [srcq+strideq*0] + pmaddubsw m14, m1, m8 ; a0 + pmaddubsw m15, m2, m8 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, m9 ; a1 + pmaddubsw m4, m9 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, m10 ; a2 + pmaddubsw m6, m10 ; b2 + paddw m14, m5 + paddw m15, m6 + shufpd m6, m0, m12,
0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m6, m0 ; 67 + punpckhbw m6, m0 ; 78 + pmaddubsw m12, m5, m11 ; a3 + pmaddubsw m13, m6, m11 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + ; wq was doubled at .v_w16, so wq*1 is the byte distance between two tmp rows of words + mova [tmpq+wq*0], m14 + mova [tmpq+wq*1], m15 + lea tmpq, [tmpq+wq*2] + sub hd, 2 + jg .v_w16_loop + ; next 16-pixel column: +16 src bytes, +32 tmp bytes; h is restored from the low byte of r6 + add r5, 16 + add r7, 32 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + WIN64_SPILL_XMM 16 + cmp wd, 4 + je .hv_w4 + ; w > 4: m10/m11 = the two 4-byte halves of the horizontal filter, + ; m12-m15 = the vertical tap pairs sign-extended to words + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + jmp .hv_w8 +.hv_w4: + ; w == 4: m8 = bytes 2-5 of the horizontal filter, m10 = pw_8192 for pmulhrsw, + ; m11 = pd_32 added before the final shift, m12-m15 = sign-extended vertical tap pairs + movzx mxd, mxb + dec srcq + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + mova m7, [subpel_h_shuf4] + pmovzxbd m9, [deint_shuf4] + vpbroadcastd m10, [pw_8192] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m11, [pd_32] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + vpbroadcastq m2, [srcq+strideq*0] + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m0, [srcq+strideq*2] + vpbroadcastq m5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m6, [srcq+strideq*1] + vpbroadcastq m1, [srcq+strideq*2] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m0, m5, 0xcc ; 2 3 + vpblendd m3, m6, 0xcc ; 4 5 + pshufb m2, m7 ; 00 01 10 11 02 03 12 13 + pshufb m0, m7 ; 20 21
30 31 22 23 32 33 + pshufb m3, m7 ; 40 41 50 51 42 43 52 53 + pshufb m1, m7 ; 60 61 60 61 62 63 62 63 + pmaddubsw m2, m8 + pmaddubsw m0, m8 + pmaddubsw m3, m8 + pmaddubsw m1, m8 + phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b + phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ + pmulhrsw m2, m10 + pmulhrsw m3, m10 + palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + pshufd m0, m3, q2121 + punpcklwd m3, m0 ; 45 56 +.hv_w4_loop: + ; 4 output rows per iteration: m5 accumulates rows a/b, m6 rows c/d; + ; m11 is added to both sums before the arithmetic shift right by 6 + pmaddwd m5, m1, m12 ; a0 b0 + pmaddwd m6, m2, m12 ; c0 d0 + pmaddwd m2, m13 ; a1 b1 + pmaddwd m4, m3, m13 ; c1 d1 + mova m1, m3 + pmaddwd m3, m14 ; a2 b2 + paddd m5, m2 + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + paddd m6, m4 + vpbroadcastq m4, [srcq+strideq*0] + paddd m5, m3 + vpbroadcastq m3, [srcq+strideq*1] + vpblendd m2, m4, 0xcc + vpbroadcastq m4, [srcq+strideq*2] + vpblendd m3, m4, 0xcc + pshufb m2, m7 + pshufb m3, m7 + pmaddubsw m2, m8 + pmaddubsw m3, m8 + phaddw m2, m3 + pmulhrsw m2, m10 + palignr m3, m2, m0, 12 + mova m0, m2 + punpcklwd m2, m3, m0 ; 67 78 + punpckhwd m3, m0 ; 89 9a + pmaddwd m4, m2, m14 ; c2 d2 + paddd m6, m11 + paddd m5, m11 + paddd m6, m4 + pmaddwd m4, m2, m15 ; a3 b3 + paddd m5, m4 + pmaddwd m4, m3, m15 ; c3 d3 + paddd m6, m4 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermd m5, m9, m5 + mova [tmpq], m5 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + ; w >= 8 is handled in 8-pixel-wide columns. r6d = h + 256*(w/8 - 1): row count in + ; the low byte, remaining columns above it. r5/r7 keep the src/tmp start of the column. + lea r6d, [wq*8-64] + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+r6*4] +.hv_w8_loop0: + ; preload and horizontally filter the first 7 rows of the column, two rows per register + vbroadcasti128 m7, [subpel_h_shufA] + movu xm4, [srcq+strideq*0] + vbroadcasti128 m8, [subpel_h_shufB] + movu xm5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m9, [subpel_h_shufC] + movu xm6, [srcq+strideq*0] + vbroadcasti128 m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 + vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 + lea srcq, [srcq+strideq*2] + vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 + HV_H_W8 m4, m1,
m2, m3, m7, m8, m9 + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + vpbroadcastd m7, [pw_8192] + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + pmulhrsw m0, m7 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + vpermq m7, m0, q3120 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + ; 2 output rows per iteration. The upper lane of m0 is parked in the not-yet-written + ; tmp slot and reloaded into m6 below, because all 16 vector registers are in use. + vextracti128 [tmpq], m0, 1 ; not enough registers + movu xm0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 + pmaddwd m8, m1, m12 ; a0 + pmaddwd m9, m2, m12 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m13 ; a1 + pmaddwd m4, m13 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m14 ; a2 + pmaddwd m6, m14 ; b2 + paddd m8, m5 + paddd m9, m6 + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + vbroadcasti128 m5, [subpel_h_shufA] + HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pd_32] + vbroadcasti128 m6, [tmpq] + pmulhrsw m0, m5 + paddd m8, m7 + paddd m9, m7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, m15 ; a3 + paddd m8, m7 + pmaddwd m7, m6, m15 ; b3 + paddd m7, m9 + psrad m8, 6 + psrad m7, 6 + packssdw m8, m7 + vpermq m7, m8, q3120 + ; each row is 8 words (16 bytes); wq*2 is the byte distance between two tmp rows + mova [tmpq+wq*0], xm7 + vextracti128 [tmpq+wq*2], m7, 1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w8_loop + ; next 8-pixel column: +8 src bytes, +16 tmp bytes; h is restored from the low byte of r6 + add r5, 8 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +; movifprep dst, src: emits "mov dst, src" only when isprep is nonzero. +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +; REMAP_REG a, b: redefines the names r<a>, r<a>q and r<a>d to what r<b>, r<b>q and r<b>d currently expand to. +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +; When isprep is set: saves the current r14 as r14_save, then renames r14..r1 +; to r13..r0 (each register name now refers to the previous register). +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j +
%assign %%i %%i-1 + %endrep + %endif +%endmacro + +; When isprep is set: renames r1..r13 to r2..r14 (undoing the shift applied by +; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV) and restores r14 from r14_save. +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +; Emits RET with the default register names in effect. Unless the optional +; argument is 0 (default 1), the shifted names are re-applied afterwards. +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +; Loads 8 bytes at each of the offsets r4, r6, r7, r9, r10, r11, r13 and rX from +; srcq for two consecutive rows (srcq advances by 2*ssq), multiplies them by the +; coefficients in m15/m10, reduces with phaddw and scales with pmulhrsw by m12. +; Result in m%1; the other seven arguments are clobbered as temporaries. +%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movq xm%1, [srcq+ r4] + movq xm%2, [srcq+ r6] + movhps xm%1, [srcq+ r7] + movhps xm%2, [srcq+ r9] + vinserti128 m%1, [srcq+r10], 1 + vinserti128 m%2, [srcq+r11], 1 + vpbroadcastq m%5, [srcq+r13] + vpbroadcastq m%6, [srcq+ rX] + add srcq, ssq + movq xm%3, [srcq+ r4] + movq xm%4, [srcq+ r6] + movhps xm%3, [srcq+ r7] + movhps xm%4, [srcq+ r9] + vinserti128 m%3, [srcq+r10], 1 + vinserti128 m%4, [srcq+r11], 1 + vpbroadcastq m%7, [srcq+r13] + vpbroadcastq m%8, [srcq+ rX] + add srcq, ssq + vpblendd m%1, m%5, 0xc0 + vpblendd m%2, m%6, 0xc0 + vpblendd m%3, m%7, 0xc0 + vpblendd m%4, m%8, 0xc0 + pmaddubsw m%1, m15 + pmaddubsw m%2, m10 + pmaddubsw m%3, m15 + pmaddubsw m%4, m10 + phaddw m%1, m%2 + phaddw m%3, m%4 + phaddw m%1, m%3 + pmulhrsw m%1, m12 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+120] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + lea base_reg, [%1_8tap_scaled_avx2] 
+%define base base_reg-%1_8tap_scaled_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm14, mxd + vpbroadcastd m14, xm14 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%else + vpbroadcastd m14, mxm +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+112] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+112] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + vpbroadcastd m10, [base+pd_0x3ff] + vpbroadcastd m12, [base+pw_8192] +%ifidn %1, put + vpbroadcastd m13, [base+pd_512] +%else + vpbroadcastd m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, 
m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpblendd m15, m7, 0xaa + vpblendd m0, m2, 0xc0 ; 0 1 4 5 + vpblendd m1, m3, 0xc0 ; 2 3 6 7 + pblendvb m15, m11, m8 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 ; 4 5 6 7 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + pmaddwd xm8, xm4, xm11 + paddd xm5, xm6 + paddd xm7, xm8 + paddd xm5, xm13 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq], xm5, 0 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp 
.w2_loop +%endif +.w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd xm15, xm0 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m0, m9 + psrld m14, 10 + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m15, xm15, 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pblendvb m15, m11, m0 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + phaddw m7, m9 + phaddw m8, m10 + pmulhrsw m7, m12 ; 0 1 4 5 + pmulhrsw m8, m12 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm11, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm11 ; 67 + mova [rsp+0x00], xm7 + mova [rsp+0x10], xm8 + mova [rsp+0x20], xm9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm10, r6q + punpcklbw xm10, xm10 + psraw xm10, 8 + pshufd xm7, xm10, q0000 + pshufd xm8, xm10, q1111 + 
pshufd xm9, xm10, q2222 + pshufd xm10, xm10, q3333 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pmaddwd xm6, xm2, xm9 + pmaddwd xm7, xm3, xm10 + paddd xm4, xm5 + paddd xm6, xm7 + paddd xm4, xm13 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq], xm4 + add dstq, dsq +%else + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu xm4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x00] + mova [rsp+0x00], xm1 + mova xm1, [rsp+0x10] + mova [rsp+0x10], xm2 + mova xm2, [rsp+0x20] + mova [rsp+0x20], xm3 + pshufb xm4, xm14 + pmaddubsw xm4, xm15 + phaddw xm4, xm4 + pmulhrsw xm4, xm12 + punpcklwd xm3, xm11, xm4 + mova xm11, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm5, [srcq+ssq*1] + movu m6, [rsp+0x10] + pshufb xm4, xm14 + pshufb xm5, xm14 + pmaddubsw xm4, xm15 + pmaddubsw xm5, xm15 + movu [rsp+0x00], m6 + phaddw xm4, xm5 + pmulhrsw xm4, xm12 + punpcklwd xm9, xm11, xm4 + mova [rsp+0x20], xm9 + psrldq xm11, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm11 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +.w8: + mov dword [rsp+48], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+48], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+48], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+48], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+48], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+72], t0d + mov [rsp+56], srcq + mov [rsp+64], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+48] + jz .ret + add qword [rsp+64], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + 
vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+16] + vpbroadcastd m15, [rsp+72] + pxor m9, m9 + mov srcq, [rsp+56] + mov r0q, [rsp+64] ; dstq / tmpq +.hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+16], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + mova [rsp], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + 
pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+52], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + 
pmulhrsw m3, m12 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + pshufd xm8, xm10, q0000 + pshufd xm9, xm10, q1111 + pshufd xm11, xm10, q3333 + pshufd xm10, xm10, q2222 + vpblendd m0, m2, 0xc0 + pshufb m1, m14 + pshufb m0, m14 + pmaddubsw m1, m15 + pmaddubsw m0, m15 + phaddw m0, m1 + pmulhrsw m0, m12 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, rndshift + 
packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vinserti128 m10, xm10, 1 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, 
[srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + pshufd xm4, xm4, q3120 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET +.dy1_w8: + mov dword [rsp+72], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+72], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+72], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + punpcklbw xm0, xm0 + psraw xm0, 8 + mova [rsp+96], xm0 + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+72] + jz .ret + add qword [rsp+88], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov r0q, [rsp+88] ; dstq / tmpq +.dy1_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 
+ psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + movu [rsp], m10 + vpbroadcastd m8, [rsp+0x60] + vpbroadcastd m9, [rsp+0x64] + vpbroadcastd m10, [rsp+0x68] + vpbroadcastd m11, [rsp+0x6c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, 
[srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 
23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, 
r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vpblendd m0, m1, m6, 0x0f + mova m1, m6 + vpermq m0, m0, q1032 ; 45 67 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET +.dy2_w8: + mov dword [rsp+40], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+40], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd 
m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + punpcklbw xm0, xm0 + psraw xm0, 8 + mova [rsp+0x50], xm0 + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+40] + jz .ret + add qword [rsp+56], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov r0q, [rsp+56] ; dstq / tmpq +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m11, [rsp+0x58] + vpbroadcastd m4, [rsp+0x5c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + SWAP m14, m4 +.dy2_vloop: + pmaddwd m4, m0, 
m8
+    pmaddwd m5, m1, m9
+    pmaddwd m6, m2, m11
+    pmaddwd m7, m3, m14
+    paddd m4, m5
+    paddd m6, m7
+    paddd m4, m13
+    paddd m4, m6
+    psrad m4, rndshift
+    vextracti128 xm5, m4, 1
+    packssdw xm4, xm5
+%ifidn %1, put
+    packuswb xm4, xm4
+    movq [dstq], xm4
+    add dstq, dsm
+%else
+    mova [tmpq], xm4
+    add tmpq, tmp_stridem
+%endif
+    dec hd
+    jz .dy2_hloop_prep
+    mova m0, m1
+    mova m1, m2
+    mova m2, m3
+    movq xm3, [srcq+ r4]
+    movq xm4, [srcq+ r6]
+    movhps xm3, [srcq+ r7]
+    movhps xm4, [srcq+ r9]
+    vinserti128 m3, [srcq+r10], 1
+    vinserti128 m4, [srcq+r11], 1
+    vpbroadcastq m5, [srcq+r13]
+    vpbroadcastq m6, [srcq+ rX]
+    add srcq, ssq
+    vpblendd m3, m5, 0xc0
+    vpblendd m4, m6, 0xc0
+    pmaddubsw m3, m15
+    pmaddubsw m4, m10
+    phaddw m3, m4
+    movq xm4, [srcq+ r4]
+    movq xm5, [srcq+ r6]
+    movhps xm4, [srcq+ r7]
+    movhps xm5, [srcq+ r9]
+    vinserti128 m4, [srcq+r10], 1
+    vinserti128 m5, [srcq+r11], 1
+    vpbroadcastq m6, [srcq+r13]
+    vpbroadcastq m7, [srcq+ rX]
+    add srcq, ssq
+    vpblendd m4, m6, 0xc0
+    vpblendd m5, m7, 0xc0
+    pmaddubsw m4, m15
+    pmaddubsw m5, m10
+    phaddw m4, m5
+    psrld m5, m3, 16
+    pslld m6, m4, 16
+    paddw m3, m5
+    paddw m4, m6
+    pblendw m3, m4, 0xaa
+    pmulhrsw m3, m12
+    jmp .dy2_vloop
+.ret:
+    MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+; BILIN_SCALED_FN put|prep
+; Emits the %1_bilin_scaled entry point. It loads the same constant,
+; (5*15 << 16) | 5*15, into both t0d and t1d and then tail-jumps into the
+; %1_8tap_scaled function, so the bilinear scaled path reuses that code.
+; NOTE(review): 5*15 is presumably a row offset into subpel_filters that
+; selects bilinear-equivalent coefficients -- confirm against the FN macro.
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+    mov t0d, (5*15 << 16) | 5*15
+    mov t1d, t0d
+    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+
+; Scratch register numbers behind t0/t1 for the put_* instantiations below;
+; the pair differs between WIN64 and the other ABI.
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+; Shorthands that pass the function-name prefix as the first FN argument.
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+; put: the bilinear stub, one FN invocation per filter-type pair, then the
+; MC_8TAP_SCALED body instantiated for put.
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+; Scratch register numbers behind t0/t1 for the prep_* instantiations below.
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+; prep: same set of entry points as for put, followed by the MC_8TAP_SCALED
+; body instantiated for prep.
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+; WARP_V dst, 02, 46, 13, 57
+; One vertical filtering step of the affine warp.
+; Loads eight 8-byte rows of the table at filterq, indexed by
+; (my + k*delta) >> 10 for k = 0..7 (labelled a..h below). The coefficients
+; are interleaved with bytes of m11 (zeroed in .main) so that each one sits in
+; the high byte of a word ("<< 8" below), multiplied against the row-pair
+; registers m%2..m%5, and the 32-bit sums are left in m%1.
+; Side effects: my becomes my + 3*delta + gammaq (the caller has already
+; subtracted 3*delta from gamma, see .main); m%2 is overwritten; m0, m8, m9,
+; tmp1 and tmp2 are clobbered.
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+    ; Can be done using gathers, but that's terribly slow on many CPU:s
+    lea tmp1d, [myq+deltaq*4]
+    lea tmp2d, [myq+deltaq*1]
+    shr myd, 10
+    shr tmp1d, 10
+    movq xm8, [filterq+myq *8]
+    vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+    lea tmp1d, [tmp2q+deltaq*4]
+    lea myd, [tmp2q+deltaq*1]
+    shr tmp2d, 10
+    shr tmp1d, 10
+    movq xm0, [filterq+tmp2q*8]
+    vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+    lea tmp1d, [myq+deltaq*4]
+    lea tmp2d, [myq+deltaq*1]
+    shr myd, 10
+    shr tmp1d, 10
+    movq xm9, [filterq+myq *8]
+    vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+    lea tmp1d, [tmp2q+deltaq*4]
+    lea myd, [tmp2q+gammaq] ; my += gamma
+    shr tmp2d, 10
+    shr tmp1d, 10
+    punpcklwd m8, m0
+    movq xm0, [filterq+tmp2q*8]
+    vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+    punpcklwd m0, m9, m0
+    punpckldq m9, m8, m0
+    punpckhdq m0, m8, m0
+    punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+    punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+    pmaddwd m%2, m8
+    pmaddwd m9, m%3
+    punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+    punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+    pmaddwd m8, m%4
+    pmaddwd m0, m%5
+    paddd m%2, m9
+    paddd m0, m8
+    paddd m%1, m0, m%2
+%endmacro
+
+; warp_affine_8x8t: variant of the 8x8 affine warp that stores 16-bit results
+; through tmpq with stride tsq. It reuses .main, .main2 and .end of
+; warp_affine_8x8_avx2; only tmp and ts are named here.
+; NOTE(review): .main addresses the remaining arguments through the register
+; names declared by warp_affine_8x8, which presumably relies on both functions
+; receiving them in the same positions -- confirm.
+cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+%if WIN64
+    ; room for the xmm6-xmm15 saves performed in .main
+    sub rsp, 0xa0
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
+.loop: +
psrad m7, 13
+    psrad m0, 13
+    packssdw m7, m0
+    pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+    vpermq m7, m7, q3120
+    ; two rows of eight words per iteration, tsq*2 bytes apart
+    ; NOTE(review): ts is presumably counted in 16-bit elements -- confirm
+    mova [tmpq+tsq*0], xm7
+    vextracti128 [tmpq+tsq*2], m7, 1
+    dec r4d
+    jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
+    lea tmpq, [tmpq+tsq*4]
+    jmp .loop
+
+; warp_affine_8x8: 8x8 affine warp that stores 8-bit pixels through dstq.
+; Flow: .main loads the parameters, filters the first seven source rows
+; horizontally (seven calls to .h) and sets the iteration count r4d = 4. Each
+; pass through .main2 filters two more source rows and leaves two output rows
+; as dwords in m7 and m0. .start rounds and packs them to bytes and stores
+; them at dstq and dstq+dsq.
+cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+    beta, filter, tmp1, delta, my, gamma
+%if WIN64
+    sub rsp, 0xa0
+    ; NOTE(review): these %assigns appear to tell the x86inc epilogue about the
+    ; manually reserved area and the xmm registers saved in .main -- confirm
+    %assign xmm_regs_used 16
+    %assign stack_size_padded 0xa0
+    %assign stack_offset stack_offset+stack_size_padded
+%endif
+    call .main
+    jmp .start
+.loop:
+    call .main2
+    lea dstq, [dstq+dsq*2]
+.start:
+    psrad m7, 18
+    psrad m0, 18
+    packusdw m7, m0
+    pavgw m7, m11 ; (x + (1 << 10)) >> 11
+    vextracti128 xm0, m7, 1
+    packuswb xm7, xm0
+    pshufd xm7, xm7, q3120
+    movq [dstq+dsq*0], xm7
+    movhps [dstq+dsq*1], xm7
+    dec r4d
+    jg .loop
+.end:
+    RET
+ALIGN function_align
+.main:
+    ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+    mov abcdq, r5m
+    mov mxd, r6m
+    movaps [rsp+stack_offset+0x10], xmm6
+    movaps [rsp+stack_offset+0x20], xmm7
+    movaps [rsp+0x28], xmm8
+    movaps [rsp+0x38], xmm9
+    movaps [rsp+0x48], xmm10
+    movaps [rsp+0x58], xmm11
+    movaps [rsp+0x68], xmm12
+    movaps [rsp+0x78], xmm13
+    movaps [rsp+0x88], xmm14
+    movaps [rsp+0x98], xmm15
+%endif
+    movsx alphad, word [abcdq+2*0]
+    movsx betad, word [abcdq+2*1]
+    mova m12, [warp_8x8_shufA]
+    mova m13, [warp_8x8_shufB]
+    vpbroadcastd m14, [pw_8192]
+    vpbroadcastd m15, [pd_32768]
+    pxor m11, m11
+    lea filterq, [mc_warp_filter]
+    lea tmp1q, [ssq*3+3]
+    ; NOTE(review): 512 looks like the rounding term for the later >> 10 and
+    ; 64<<10 like an index bias into mc_warp_filter -- confirm
+    add mxd, 512+(64<<10)
+    lea tmp2d, [alphaq*3]
+    sub srcq, tmp1q ; src -= src_stride*3 + 3
+    sub betad, tmp2d ; beta -= alpha*3
+    mov myd, r7m
+    ; Each .h call returns one filtered row in m0; rows are paired into
+    ; registers as indicated by the two-digit comments.
+    call .h
+    psrld m1, m0, 16
+    call .h
+    psrld m4, m0, 16
+    call .h
+    pblendw m1, m0, 0xaa ; 02
+    call .h
+    pblendw m4, m0, 0xaa ; 13
+    call .h
+    psrld m2, m1, 16
+    pblendw m2, m0, 0xaa ; 24
+    call .h
+    psrld m5, m4, 16
+    pblendw m5, m0, 0xaa ; 35
+    call .h
+    psrld m3, m2, 16
+    pblendw m3, m0, 0xaa ; 46
+    movsx deltad, word [abcdq+2*2]
+    movsx gammad, word [abcdq+2*3]
+    add myd, 512+(64<<10)
+    mov r4d, 4
+    lea tmp1d, [deltaq*3]
+    sub gammad, tmp1d ; gamma -= delta*3
+; .main2: filter two more source rows and produce two output rows, the first
+; in m7 and the second in m0, then rotate the row-pair registers.
+.main2:
+    call .h
+    psrld m6, m5, 16
+    pblendw m6, m0, 0xaa ; 57
+    WARP_V 7, 1, 3, 4, 6
+    call .h
+    mova m1, m2
+    mova m2, m3
+    psrld m3, 16
+    pblendw m3, m0, 0xaa ; 68
+    WARP_V 0, 4, 6, 1, 3
+    mova m4, m5
+    mova m5, m6
+    ret
+ALIGN function_align
+; .h: horizontally filter one source row.
+; Loads 16 source bytes, picks eight filter rows at (mx + k*alpha) >> 10 for
+; k = 0..7, then sets mx = mx + 3*alpha + betaq (3*alpha was already
+; subtracted from beta in .main) and advances srcq by ssq. The result is
+; returned in m0; m8, m9, m10, tmp1 and tmp2 are clobbered.
+.h:
+    lea tmp1d, [mxq+alphaq*4]
+    lea tmp2d, [mxq+alphaq*1]
+    vbroadcasti128 m10, [srcq]
+    shr mxd, 10
+    shr tmp1d, 10
+    movq xm8, [filterq+mxq *8]
+    vinserti128 m8, [filterq+tmp1q*8], 1
+    lea tmp1d, [tmp2q+alphaq*4]
+    lea mxd, [tmp2q+alphaq*1]
+    shr tmp2d, 10
+    shr tmp1d, 10
+    movq xm0, [filterq+tmp2q*8]
+    vinserti128 m0, [filterq+tmp1q*8], 1
+    lea tmp1d, [mxq+alphaq*4]
+    lea tmp2d, [mxq+alphaq*1]
+    shr mxd, 10
+    shr tmp1d, 10
+    movq xm9, [filterq+mxq *8]
+    vinserti128 m9, [filterq+tmp1q*8], 1
+    lea tmp1d, [tmp2q+alphaq*4]
+    lea mxd, [tmp2q+betaq] ; mx += beta
+    shr tmp2d, 10
+    shr tmp1d, 10
+    punpcklqdq m8, m0 ; 0 1 4 5
+    movq xm0, [filterq+tmp2q*8]
+    vinserti128 m0, [filterq+tmp1q*8], 1
+    punpcklqdq m9, m0 ; 2 3 6 7
+    pshufb m0, m10, m12
+    pmaddubsw m0, m8
+    pshufb m10, m13
+    pmaddubsw m10, m9
+    add srcq, ssq
+    phaddw m0, m10
+    pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+    paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+    ret
+
+; BIDIR_FN op
+; Store/loop skeleton for the two-input compound functions that invoke it
+; (avg, w_avg and mask in this part of the file).
+; `op N` must leave 32 packed output bytes in m0, computed from the inputs at
+; offset N, where N counts 32-byte vectors of the tmp buffers.
+; `op_INC_PTR N` advances the input pointers by N such vectors.
+; The macro is entered with wq holding the address of the label for the block
+; width and hd holding the row count.
+%macro BIDIR_FN 1 ; op
+    %1 0
+    lea stride3q, [strideq*3]
+    jmp wq
+; .w4: one op result covers eight 4-pixel rows; for h == 4 only the first
+; four are stored, for h == 16 a second op (offset 2) supplies rows 8-15.
+.w4:
+    vextracti128 xm1, m0, 1
+    movd [dstq ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp hd, 4
+    je .ret
+    lea dstq, [dstq+strideq*4]
+    pextrd [dstq ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    cmp hd, 8
+    je .ret
+    %1 2
+    lea dstq, [dstq+strideq*4]
+    vextracti128 xm1, m0, 1
+    movd [dstq ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea dstq, [dstq+strideq*4]
+    pextrd [dstq ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+.ret:
+    RET
+; .w8: one op result covers four 8-pixel rows.
+.w8_loop:
+    %1_INC_PTR 2
+    %1 0
+    lea dstq, [dstq+strideq*4]
+.w8:
+    vextracti128 xm1, m0, 1
+    movq [dstq ], xm0
+    movq [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    sub hd, 4
+    jg .w8_loop
+    RET
+; .w16 and wider: packuswb packs within each 128-bit lane, so vpermq q3120
+; puts the bytes back into linear order before the store.
+; .w16: two ops per iteration, two 16-pixel rows each.
+.w16_loop:
+    %1_INC_PTR 4
+    %1 0
+    lea dstq, [dstq+strideq*4]
+.w16:
+    vpermq m0, m0, q3120
+    mova [dstq ], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    %1 2
+    vpermq m0, m0, q3120
+    mova [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    sub hd, 4
+    jg .w16_loop
+    RET
+; .w32: two ops per iteration, one 32-pixel row each.
+.w32_loop:
+    %1_INC_PTR 4
+    %1 0
+    lea dstq, [dstq+strideq*2]
+.w32:
+    vpermq m0, m0, q3120
+    mova [dstq+strideq*0], m0
+    %1 2
+    vpermq m0, m0, q3120
+    mova [dstq+strideq*1], m0
+    sub hd, 2
+    jg .w32_loop
+    RET
+; .w64: two ops per 64-pixel row.
+.w64_loop:
+    %1_INC_PTR 4
+    %1 0
+    add 
dstq, strideq
+.w64:
+    vpermq m0, m0, q3120
+    mova [dstq], m0
+    %1 2
+    vpermq m0, m0, q3120
+    mova [dstq+32], m0
+    dec hd
+    jg .w64_loop
+    RET
+; .w128: four ops per 128-pixel row. The input pointers are advanced by a
+; whole row (8 vectors) after the second op, so the last two ops use the
+; negative offsets -4 and -2.
+.w128_loop:
+    %1 0
+    add dstq, strideq
+.w128:
+    vpermq m0, m0, q3120
+    mova [dstq+0*32], m0
+    %1 2
+    vpermq m0, m0, q3120
+    mova [dstq+1*32], m0
+    %1_INC_PTR 8
+    %1 -4
+    vpermq m0, m0, q3120
+    mova [dstq+2*32], m0
+    %1 -2
+    vpermq m0, m0, q3120
+    mova [dstq+3*32], m0
+    dec hd
+    jg .w128_loop
+    RET
+%endmacro
+
+; AVG src_offset
+; m0 = packuswb(pmulhrsw(tmp1 + tmp2, m2)) for 32 samples starting at vector
+; offset %1. m2 holds the pw_1024 constant broadcast by avg. Clobbers m1.
+%macro AVG 1 ; src_offset
+    mova m0, [tmp1q+(%1+0)*32]
+    paddw m0, [tmp2q+(%1+0)*32]
+    mova m1, [tmp1q+(%1+1)*32]
+    paddw m1, [tmp2q+(%1+1)*32]
+    pmulhrsw m0, m2
+    pmulhrsw m1, m2
+    packuswb m0, m1
+%endmacro
+
+; Advance both intermediate buffers by %1 32-byte vectors.
+%macro AVG_INC_PTR 1
+    add tmp1q, %1*32
+    add tmp2q, %1*32
+%endmacro
+
+; avg(dst, stride, tmp1, tmp2, w, h): average of two intermediate buffers.
+; tzcnt(w) indexes a table of 32-bit offsets relative to the table itself; the
+; resulting address in wq is the width-specific label inside BIDIR_FN.
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+    lea r6, [avg %+ SUFFIX %+ _table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    movsxd wq, dword [r6+wq*4]
+    vpbroadcastd m2, [base+pw_1024]
+    add wq, r6
+    BIDIR_FN AVG
+
+; W_AVG src_offset
+; Weighted average of 32 samples into m0. Expects m4 = the scaled weight and
+; m5 = the pw_2048 constant, both set up by w_avg. Clobbers m1-m3.
+%macro W_AVG 1 ; src_offset
+    ; (a * weight + b * (16 - weight) + 128) >> 8
+    ; = ((a - b) * weight + (b << 4) + 128) >> 8
+    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+    mova m0, [tmp1q+(%1+0)*32]
+    psubw m2, m0, [tmp2q+(%1+0)*32]
+    mova m1, [tmp1q+(%1+1)*32]
+    psubw m3, m1, [tmp2q+(%1+1)*32]
+    pmulhw m2, m4
+    pmulhw m3, m4
+    paddw m0, m2
+    paddw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+; w_avg(dst, stride, tmp1, tmp2, w, h, weight): weighted average of two
+; intermediate buffers; the weight is read from r6m.
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+    lea r6, [w_avg %+ SUFFIX %+ _table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    vpbroadcastw m4, r6m ; weight
+    movsxd wq, dword [r6+wq*4]
+    vpbroadcastd m5, [base+pw_2048]
+    psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+    add wq, r6
+    ; For weight <= 7 the shifted weight is not negative as a signed word, so
+    ; the (weight-16) form does not apply: swap the two inputs and negate the
+    ; multiplier, which is the last identity in the W_AVG comment.
+    cmp dword r6m, 7
+    jg .weight_gt7
+    mov r6, tmp1q
+    pxor m0, m0
+    mov tmp1q, tmp2q
+    psubw m4, m0, m4 ; -weight
+    mov tmp2q, r6
+.weight_gt7:
+    BIDIR_FN W_AVG
+
+; MASK src_offset
+; Per-sample masked blend of 32 samples into m0, using one mask byte per
+; 16-bit intermediate sample. Expects m4 = 0 and m5 = the pw_2048 constant,
+; both set up by mask. Clobbers m1-m3.
+%macro MASK 1 ; src_offset
+    ; (a * m + b * (64 - m) + 512) >> 10
+    ; = ((a - b) * m + (b << 6) + 512) >> 10
+    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+    vpermq m3, [maskq+%1*16], q3120
+    mova m0, [tmp2q+(%1+0)*32]
+    psubw m1, m0, [tmp1q+(%1+0)*32]
+    psubb m3, m4, m3
+    paddw m1, m1 ; (b - a) << 1
+    paddb m3, m3
+    punpcklbw m2, m4, m3 ; -m << 9
+    pmulhw m1, m2
+    paddw m0, m1
+    mova m1, [tmp2q+(%1+1)*32]
+    psubw m2, m1, [tmp1q+(%1+1)*32]
+    paddw m2, m2
+    punpckhbw m3, m4, m3
+    pmulhw m2, m3
+    paddw m1, m2
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+%endmacro
+
+; Advance the mask by %1*16 bytes and both intermediate buffers by %1*32
+; bytes, i.e. by the same number of samples.
+%macro MASK_INC_PTR 1
+    add maskq, %1*16
+    add tmp2q, %1*32
+    add tmp1q, %1*32
+%endmacro
+
+; mask(dst, stride, tmp1, tmp2, w, h, mask): blend of two intermediate buffers
+; with an explicit per-sample mask; dispatches into BIDIR_FN like avg.
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+    lea r7, [mask %+ SUFFIX %+ _table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    mov maskq, maskmp
+    movsxd wq, dword [r7+wq*4]
+    vpbroadcastd m5, [base+pw_2048]
+    pxor m4, m4
+    add wq, r7
+    BIDIR_FN MASK
+
+; W_MASK dst, mask, tmp_offset1, tmp_offset2[, 4:4:4]
+; Blends the vectors at offsets %3 and %4 of tmp1/tmp2 with a weight derived
+; from the inputs themselves: (m6 -sat |tmp2 - tmp1|) >> 8 is used as 64 - m.
+; Results: m%1 = 32 blended samples as packed bytes. m%2 = with %5 set, the
+; bytes m5 - (64 - m) in linear order; otherwise the word sums of horizontally
+; adjacent (64 - m) values.
+; Expects m6 and m7 (and m5 when %5 is set) to be preloaded by the caller,
+; which is outside this part of the file. Clobbers m1-m3.
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+    mova m%1, [tmp1q+32*%3]
+    mova m1, [tmp2q+32*%3]
+    psubw m1, m%1
+    pabsw m%2, m1
+    psubusw m%2, m6, m%2
+    psrlw m%2, 8 ; 64 - m
+    psllw m2, m%2, 10
+    pmulhw m1, m2
+    paddw m%1, m1
+    mova m1, [tmp1q+32*%4]
+    mova m2, [tmp2q+32*%4]
+    psubw m2, m1
+    pabsw m3, m2
+    psubusw m3, m6, m3
+    psrlw m3, 8
+%if %5
+    packuswb m%2, m3
+    psubb m%2, m5, m%2
+    vpermq m%2, m%2, q3120
+%else
+    phaddw m%2, m3
+%endif
+    psllw m3, 10
+    pmulhw m2, m3
+    paddw m1, m2
+    pmulhrsw m%1, m7
+    pmulhrsw m1, m7
+    packuswb m%1, m1
+%endmacro
+
+; blend(dst, ds, tmp, w, h, mask): in-place blend of dst with tmp using one
+; mask byte per pixel. Each loop interleaves (dst, tmp) bytes with
+; (m4 - mask, mask) bytes, multiply-adds them with pmaddubsw and rounds with
+; pmulhrsw by m5.
+; NOTE(review): assuming pb_64 and pw_512 hold the values their names suggest,
+; this is (dst*(64 - m) + tmp*m + 32) >> 6 -- confirm.
+; r6 first holds the jump-table base and is then reused as dsq*3.
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+    lea r6, [blend_avx2_table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    movifnidn maskq, maskmp
+    movsxd wq, dword [r6+wq*4]
+    vpbroadcastd m4, [base+pb_64]
+    vpbroadcastd m5, [base+pw_512]
+    add wq, r6
+    lea r6, [dsq*3]
+    jmp wq
+; .w4: four 4-pixel rows per iteration.
+.w4:
+    movd xm0, [dstq+dsq*0]
+    pinsrd xm0, [dstq+dsq*1], 1 +
vpbroadcastd xm1, [dstq+dsq*2]
+    pinsrd xm1, [dstq+r6 ], 3
+    mova xm6, [maskq]
+    psubb xm3, xm4, xm6
+    punpcklbw xm2, xm3, xm6
+    punpckhbw xm3, xm6
+    mova xm6, [tmpq]
+    add maskq, 4*4
+    add tmpq, 4*4
+    punpcklbw xm0, xm6
+    punpckhbw xm1, xm6
+    pmaddubsw xm0, xm2
+    pmaddubsw xm1, xm3
+    pmulhrsw xm0, xm5
+    pmulhrsw xm1, xm5
+    packuswb xm0, xm1
+    movd [dstq+dsq*0], xm0
+    pextrd [dstq+dsq*1], xm0, 1
+    pextrd [dstq+dsq*2], xm0, 2
+    pextrd [dstq+r6 ], xm0, 3
+    lea dstq, [dstq+dsq*4]
+    sub hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+; .w8: four 8-pixel rows per iteration; r6 holds dsq*3.
+.w8:
+    movq xm1, [dstq+dsq*0]
+    movhps xm1, [dstq+dsq*1]
+    vpbroadcastq m2, [dstq+dsq*2]
+    vpbroadcastq m3, [dstq+r6 ]
+    mova m0, [maskq]
+    mova m6, [tmpq]
+    add maskq, 8*4
+    add tmpq, 8*4
+    vpblendd m1, m2, 0x30
+    vpblendd m1, m3, 0xc0
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    vextracti128 xm1, m0, 1
+    movq [dstq+dsq*0], xm0
+    movhps [dstq+dsq*1], xm0
+    movq [dstq+dsq*2], xm1
+    movhps [dstq+r6 ], xm1
+    lea dstq, [dstq+dsq*4]
+    sub hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+; .w16: two 16-pixel rows per iteration, one per 128-bit lane.
+.w16:
+    mova m0, [maskq]
+    mova xm1, [dstq+dsq*0]
+    vinserti128 m1, [dstq+dsq*1], 1
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    mova m6, [tmpq]
+    add maskq, 16*2
+    add tmpq, 16*2
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    mova [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea dstq, [dstq+dsq*2]
+    sub hd, 2
+    jg .w16
+    RET
+ALIGN function_align
+; .w32: one 32-pixel row per iteration.
+.w32:
+    mova m0, [maskq]
+    mova m1, [dstq]
+    mova m6, [tmpq]
+    add maskq, 32
+    add tmpq, 32
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    mova [dstq], m0
+    add dstq, dsq
+    dec hd
+    jg .w32
+    RET
+
+; blend_v(dst, ds, tmp, w, h): in-place blend of dst with tmp whose weights
+; are loaded once per block width from the obmc_masks table and then reused
+; unchanged for every row.
+; NOTE(review): the weights are applied directly to interleaved (dst, tmp)
+; bytes, so obmc_masks presumably stores a byte pair per pixel -- confirm
+; against the table definition.
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+    lea r5, [blend_v_avx2_table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    movsxd wq, dword [r5+wq*4]
+    vpbroadcastd m5, [base+pw_512]
+    add wq, r5
+    ; mask is the sixth named register (r5), which still holds the jump-table
+    ; base here, so this leaves maskq pointing at obmc_masks
+    add maskq, obmc_masks-blend_v_avx2_table
+    jmp wq
+; .w2: two 2-pixel rows per iteration.
+.w2:
+    vpbroadcastd xm2, [maskq+2*2]
+.w2_s0_loop:
+    movd xm0, [dstq+dsq*0]
+    pinsrw xm0, [dstq+dsq*1], 1
+    movd xm1, [tmpq]
+    add tmpq, 2*2
+    punpcklbw xm0, xm1
+    pmaddubsw xm0, xm2
+    pmulhrsw xm0, xm5
+    packuswb xm0, xm0
+    pextrw [dstq+dsq*0], xm0, 0
+    pextrw [dstq+dsq*1], xm0, 1
+    lea dstq, [dstq+dsq*2]
+    sub hd, 2
+    jg .w2_s0_loop
+    RET
+ALIGN function_align
+; .w4: two 4-pixel rows per iteration.
+.w4:
+    vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
+    movd xm0, [dstq+dsq*0]
+    pinsrd xm0, [dstq+dsq*1], 1
+    movq xm1, [tmpq]
+    add tmpq, 4*2
+    punpcklbw xm0, xm1
+    pmaddubsw xm0, xm2
+    pmulhrsw xm0, xm5
+    packuswb xm0, xm0
+    movd [dstq+dsq*0], xm0
+    pextrd [dstq+dsq*1], xm0, 1
+    lea dstq, [dstq+dsq*2]
+    sub hd, 2
+    jg .w4_loop
+    RET
+ALIGN function_align
+; .w8: two 8-pixel rows per iteration; row 0 travels in the upper 128-bit
+; lane and row 1 in the lower one, for both dst and tmp.
+.w8:
+    vbroadcasti128 m4, [maskq+8*2]
+.w8_loop:
+    vpbroadcastq m2, [dstq+dsq*0]
+    movq xm0, [dstq+dsq*1]
+    vpblendd m0, m2, 0x30
+    movq xm1, [tmpq+8*1]
+    vinserti128 m1, [tmpq+8*0], 1
+    add tmpq, 8*2
+    punpcklbw m0, m1
+    pmaddubsw m0, m4
+    pmulhrsw m0, m5
+    vextracti128 xm1, m0, 1
+    packuswb xm0, xm1
+    movhps [dstq+dsq*0], xm0
+    movq [dstq+dsq*1], xm0
+    lea dstq, [dstq+dsq*2]
+    sub hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+; .w16: two 16-pixel rows per iteration; m3/m4 hold the weights for the low
+; and high eight pixels of a row.
+.w16:
+    vbroadcasti128 m3, [maskq+16*2]
+    vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
+    mova xm1, [dstq+dsq*0]
+    vinserti128 m1, [dstq+dsq*1], 1
+    mova m2, [tmpq]
+    add tmpq, 16*2
+    punpcklbw m0, m1, m2
+    punpckhbw m1, m2
+    pmaddubsw m0, m3
+    pmaddubsw m1, m4
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    mova [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea dstq, [dstq+dsq*2]
+    sub hd, 2
+    jg .w16_loop
+    RET
+ALIGN function_align
+; .w32: the four 16-byte weight groups are arranged to match the per-lane
+; output order of punpcklbw/punpckhbw.
+.w32:
+    mova xm3, [maskq+16*4]
+    vinserti128 m3, [maskq+16*6], 1
+    mova xm4, [maskq+16*5]
+    vinserti128 m4, 
[maskq+16*7], 1 +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .w32_loop + RET + +cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xm0, [dstq+dsq*0] + pinsrw xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movd xm1, [tmpq] + add tmpq, 2*2 + punpcklwd xm2, xm2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +ALIGN function_align +.w4: + mova xm3, [blend_shuf] +.w4_loop: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movq xm1, [tmpq] + add tmpq, 4*2 + pshufb xm2, xm3 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x03 +.w8_loop: + vpbroadcastq m1, [dstq+dsq*0] + movq xm0, [dstq+dsq*1] + vpblendd m0, m1, 0x30 + vpbroadcastd m3, [maskq+hq*2] + movq xm1, [tmpq+8*1] + vinserti128 m1, [tmpq+8*0], 1 + add tmpq, 8*2 + pshufb m3, m4 + punpcklbw m0, m1 + pmaddubsw m0, m3 + pmulhrsw m0, m5 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movhps [dstq+dsq*0], xm0 + movq [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x0c +.w16_loop: + mova xm1, [dstq+dsq*0] + 
vinserti128 m1, [dstq+dsq*1], 1 + vpbroadcastd m3, [maskq+hq*2] + mova m2, [tmpq] + add tmpq, 16*2 + pshufb m3, m4 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: ; w32/w64/w128 + sub dsq, r6 +.w32_loop0: + vpbroadcastw m3, [maskq+hq*2] + mov wd, r6d +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 32 + sub wd, 32 + jg .w32_loop + add dstq, dsq + inc hq + jl .w32_loop0 + RET + +cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + add srcq, r10 + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg 
leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastb m0, [srcq] +.left_loop_%3: + mova [dstq+r3], m0 + add r3, 32 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3] +%if %1 + movu [r12+r3], m0 +%else + movu [dstq+r3], m0 +%endif + add r3, 32 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + add r12, centerwq +%else + lea r12, [dstq+centerwq] +%endif + xor r3, r3 + vpbroadcastb m0, [srcq+centerwq-1] +.right_loop_%3: + movu [r12+r3], m0 + add r3, 32 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, bottomextq +.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 32 + cmp r1, bwq + jl .bottom_x_loop + 
+.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 32 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + LEA r7, $$ +%define base r7-$$ + + vpbroadcastd m3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti128 m15, [base+pb_8x0_8x8] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] + pslld m5, 3 ; dx*8 + pslld m6, 14 + paddd m8, m2 ; mx+[0..7]*dx + pxor m2, m2 + + ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 + ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 + +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx + +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + pand m9, m7 ; filter offset (masked) + + ; load source pixels - this ugly code is vpgatherdq emulation since + ; directly using vpgatherdq on Haswell is quite a bit slower :( + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vextracti128 xm0, m0, 1 + movq xm12, [srcq+r8] + movq xm13, [srcq+r10] + movhps xm12, [srcq+r9] + movhps xm13, [srcq+r11] + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vinserti128 m12, [srcq+r8], 1 + vinserti128 m13, [srcq+r10], 1 + vpbroadcastq m10, [srcq+r9] + vpbroadcastq m11, [srcq+r11] + vpblendd m12, m10, 11000000b + vpblendd m13, m11, 11000000b + + ; if no emulation is required, we don't need 
to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + vptest m1, m1 + jz .filter + + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vextracti128 xm1, m1, 1 + movq xm14, [base+resize_shuf+4+r8] + movq xm0, [base+resize_shuf+4+r10] + movhps xm14, [base+resize_shuf+4+r9] + movhps xm0, [base+resize_shuf+4+r11] + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vinserti128 m14, [base+resize_shuf+4+r8], 1 + vinserti128 m0, [base+resize_shuf+4+r10], 1 + vpbroadcastq m10, [base+resize_shuf+4+r9] + vpbroadcastq m11, [base+resize_shuf+4+r11] + vpblendd m14, m10, 11000000b + vpblendd m0, m11, 11000000b + + paddb m14, m15 + paddb m0, m15 + pshufb m12, m14 + pshufb m13, m0 + +.filter: + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vextracti128 xm9, m9, 1 + movq xm10, [base+resize_filter+r8*8] + movq xm11, [base+resize_filter+r10*8] + movhps xm10, [base+resize_filter+r9*8] + movhps xm11, [base+resize_filter+r11*8] + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vinserti128 m10, [base+resize_filter+r8*8], 1 + vinserti128 m11, [base+resize_filter+r10*8], 1 + vpbroadcastq m14, [base+resize_filter+r9*8] + vpbroadcastq m1, [base+resize_filter+r11*8] + vpblendd m10, m14, 11000000b + vpblendd m11, m1, 11000000b + + pmaddubsw m12, m10 + pmaddubsw m13, m11 + phaddw m12, m13 + vextracti128 xm13, m12, 1 + phaddsw xm12, xm13 + pmulhrsw xm12, xm3 ; x=(x+64)>>7 + packuswb xm12, xm12 + movq [dstq+xq], xm12 + + paddd m4, m5 + add xd, 8 + cmp xd, dst_wd + jl .loop_x + + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, 
wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m9, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign + add wq, r7 + W_MASK 0, 4, 0, 1 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm0, m4, 1 + vpblendd xm1, xm4, xm0, 0x05 + vpblendd xm4, xm0, 0x0a + pshufd xm1, xm1, q2301 + psubw xm4, xm8, xm4 + psubw xm4, xm1 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [maskq], xm4 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + phaddd m4, m5 + vextracti128 xm1, m0, 1 + psubw m4, m8, m4 + psrlw m4, 2 + vpermd m4, m9, m4 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], xm4 + RET +.w8_loop: + add tmp1q, 2*32 + add tmp2q, 2*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 8 +.w8: + vextracti128 xm2, m4, 1 + vextracti128 xm1, m0, 1 + psubw xm4, xm8, xm4 + psubw xm4, xm2 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + movq [maskq], xm4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + vpermq m0, m0, q3120 + 
mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + punpckhqdq m1, m4, m5 + punpcklqdq m4, m5 + psubw m1, m8, m1 + psubw m1, m4 + psrlw m1, 2 + vpermq m0, m0, q3120 + packuswb m1, m1 + vpermd m1, m9, m1 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], xm1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + psubw m4, m8, m4 + psubw m4, m5 + psrlw m4, 2 + vpermq m0, m0, q3120 + packuswb m4, m4 + vpermd m4, m9, m4 + mova [dstq+strideq*1], m0 + mova [maskq], xm4 + sub hd, 2 + jg .w32_loop + RET +.w64_loop_even: + psubw m10, m8, m4 + psubw m11, m8, m5 + dec hd +.w64_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + test hd, 1 + jz .w64_loop_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq], m4 + add maskq, 32 + dec hd + jg .w64_loop + RET +.w128_loop_even: + psubw m12, m8, m4 + psubw m13, m8, m5 + dec hd +.w128_loop: + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + add tmp1q, 8*32 + add tmp2q, 8*32 + test hd, 1 + jz .w128_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq+32*0], m4 + jmp .w128_odd +.w128_even: + psubw m10, m8, m4 + psubw m11, m8, m5 +.w128_odd: + W_MASK 0, 4, -4, -3 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, -2, -1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + test hd, 1 + jz .w128_loop_even + psubw m4, m12, m4 + psubw m5, m13, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd 
m4, m9, m4 + mova [maskq+32*1], m4 + add maskq, 64 + dec hd + jg .w128_loop + RET + +cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + pxor m9, m9 + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m10, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign + add wq, r7 + mov maskq, maskmp + W_MASK 0, 4, 0, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + mova [maskq], xm5 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermd m5, m10, m5 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], m5 + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vextracti128 xm5, m4, 1 + vextracti128 xm1, m0, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], xm5 + sub hd, 4 + 
jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], m5 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*1], m0 + mova [maskq], m5 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq], m5 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32*2 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq+32*0], m5 + W_MASK 0, 4, 4, 5 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, 6, 7 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*3], m0 + mova [maskq+32*1], m5 + dec hd + jg .w128_loop + RET + +cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + 
vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + add wq, r7 + W_MASK 0, 4, 0, 1, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + mova [maskq+32*0], m4 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + je .w4_end + W_MASK 0, 4, 2, 3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq+32*1], m4 +.w4_end: + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w8: + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], m4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [maskq], m4 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq], m0 + mova [maskq], m4 + dec hd + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*2 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq 
m0, m0, q3120 + mova [dstq+32*1], m0 + mova [maskq+32*1], m4 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*4 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + mova [maskq+32*1], m4 + W_MASK 0, 4, 4, 5, 1 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + mova [maskq+32*2], m4 + W_MASK 0, 4, 6, 7, 1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + mova [maskq+32*3], m4 + dec hd + jg .w128_loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm new file mode 100644 index 0000000000..a13c24235c --- /dev/null +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -0,0 +1,2395 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if HAVE_AVX512ICL && ARCH_X86_64 + +SECTION_RODATA 64 + +bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 + db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 + db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 +wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 + db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 + db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 +wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 + db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 + db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 +wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 + db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 66, 70, 74, 78, 82, 86, 
90, 94, 98,102,106,110,114,118,122,126 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 +bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 + db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 +bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 + db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 +bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 + db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 +bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 + db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 + db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 +bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 + db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 +bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 +spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 + db 
40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 +spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 + db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 +spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 + db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 +spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 +spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 + db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 +spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 +spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 + db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 + db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 +spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 + db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 
3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 +bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 + +wm_420_perm64: dq 0xfedcba9876543210 +wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 + +pb_127: times 4 db 127 +pw_m128 times 2 dw -128 +pw_512: times 2 dw 512 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_2: dd 2 +pd_32: dd 32 +pd_32768: dd 32768 + +%define pb_m64 (wm_sign+4) +%define pb_64 (wm_sign+8) + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 1-* + %xdefine %1_table (%%table - 2*%2) + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) + +%define 
table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro WRAP_YMM 1+ +INIT_YMM cpuname + %1 +INIT_ZMM cpuname +%endmacro + +DECLARE_REG_TMP 3, 5, 6 + +INIT_ZMM avx512icl +cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea t2, [prep_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [t2+wq*2+table_offset(prep,)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xmm0, [srcq+strideq*0] + pinsrd xmm0, [srcq+strideq*1], 1 + pinsrd xmm0, [srcq+strideq*2], 2 + pinsrd xmm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw ym0, xmm0 + psllw ym0, 4 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti128 ym0, ymm0, [srcq+strideq*2], 1 + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pmovzxbw m0, ym0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu xmm0, [srcq+strideq*0] + vinserti128 ym0, ymm0, [srcq+strideq*1], 1 + movu xmm1, [srcq+strideq*2] + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, ym0 + pmovzxbw m1, ym1 + psllw 
m0, 4 + psllw m1, 4 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .prep_w32 + RET +.prep_w64: + pmovzxbw m0, [srcq+strideq*0+32*0] + pmovzxbw m1, [srcq+strideq*0+32*1] + pmovzxbw m2, [srcq+strideq*1+32*0] + pmovzxbw m3, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .prep_w64 + RET +.prep_w128: + pmovzxbw m0, [srcq+32*0] + pmovzxbw m1, [srcq+32*1] + pmovzxbw m2, [srcq+32*2] + pmovzxbw m3, [srcq+32*3] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .prep_w128 + RET +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] + imul mxyd, 0xff01 + add mxyd, 16 << 8 + vpbroadcastw m5, mxyd + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .hv + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + vbroadcasti32x4 ym4, [bilin_h_shuf4] +.h_w4_loop: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pshufb ym0, ym4 + pmaddubsw ym0, ym5 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] +.h_w8_loop: + movu xmm0, [srcq+strideq*0] + vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 + vinserti32x4 m0, 
[srcq+strideq*2], 2 + vinserti32x4 m0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pmaddubsw m0, m5 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m4, [bilin_h_perm16] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpermb m0, m4, m0 + vpermb m1, m4, m1 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m4, [bilin_h_perm32] +.h_w32_loop: + vpermb m0, m4, [srcq+strideq*0] + vpermb m1, m4, [srcq+strideq*1] + vpermb m2, m4, [srcq+strideq*2] + vpermb m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .h_w32_loop + RET +.h_w64: + mova m4, [bilin_h_perm32] +.h_w64_loop: + vpermb m0, m4, [srcq+strideq*0+32*0] + vpermb m1, m4, [srcq+strideq*0+32*1] + vpermb m2, m4, [srcq+strideq*1+32*0] + vpermb m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .h_w64_loop + RET +.h_w128: + mova m4, [bilin_h_perm32] +.h_w128_loop: + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .h_w128_loop + RET +.v: + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 0xff01 + add mxyd, 16 << 
8 + add wq, t2 + lea stride3q, [strideq*3] + vpbroadcastw m6, mxyd + jmp wq +.v_w4: + vpbroadcastd xm0, [srcq+strideq*0] + mov r3d, 0x29 + vbroadcasti32x4 ym3, [bilin_v_shuf4] + kmovb k1, r3d +.v_w4_loop: + vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ + vpbroadcastd ym2, [srcq+strideq*2] + vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ + lea srcq, [srcq+strideq*4] + vpbroadcastd ym0, [srcq+strideq*0] + punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ + pshufb ym2, ym3 + pmaddubsw ym2, ym6 + mova [tmpq], ym2 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + mova m5, [bilin_v_perm8] + vbroadcasti32x4 ym0, [srcq+strideq*0] +.v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpbroadcastq ym0, [srcq+strideq*2] + vinserti32x4 m1, [srcq+stride3q ], 2 + lea srcq, [srcq+strideq*4] + vinserti32x4 ym0, [srcq+strideq*0], 0 + vpermt2b m1, m5, m0 + pmaddubsw m1, m6 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m5, [bilin_v_perm16] + movu xm0, [srcq+strideq*0] +.v_w16_loop: + movu xm2, [srcq+strideq*2] + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpermt2b m1, m5, m2 + vinserti32x4 ym2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + movu xm0, [srcq+strideq*0] + vpermt2b m2, m5, m0 + pmaddubsw m1, m6 + pmaddubsw m2, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m5, [bilin_v_perm32] + movu ym0, [srcq+strideq*0] +.v_w32_loop: + movu ym2, [srcq+strideq*1] + movu ym3, [srcq+strideq*2] + movu ym4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpermt2b m0, m5, m2 + vpermt2b m2, m5, m3 + vpermt2b m3, m5, m4 + pmaddubsw m1, m0, m6 + movu ym0, [srcq+strideq*0] + vpermt2b m4, m5, m0 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m4 + add tmpq, 64*4 + sub hd, 4 + jg .v_w32_loop + RET +.v_w64: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0] 
+.v_w64_loop: + vpermq m1, m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m1, m0 + punpckhbw m2, m1, m0 + vpermq m0, m5, [srcq+strideq*0] + punpcklbw m3, m0, m1 + punpckhbw m1, m0, m1 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m1, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m1 + add tmpq, 64*4 + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] +.v_w128_loop: + vpermq m2, m5, [srcq+strideq*1+ 0] + vpermq m3, m5, [srcq+strideq*1+64] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m2, m0 + punpckhbw m0, m2, m0 + pmaddubsw m4, m6 + pmaddubsw m0, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m0 + punpcklbw m4, m3, m1 + punpckhbw m1, m3, m1 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+64*2], m4 + mova [tmpq+64*3], m1 + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] + punpcklbw m4, m0, m2 + punpckhbw m2, m0, m2 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + mova [tmpq+64*4], m4 + mova [tmpq+64*5], m2 + punpcklbw m4, m1, m3 + punpckhbw m3, m1, m3 + pmaddubsw m4, m6 + pmaddubsw m3, m6 + mova [tmpq+64*6], m4 + mova [tmpq+64*7], m3 + add tmpq, 64*8 + sub hd, 2 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + vpbroadcastw m6, mxyd + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + vbroadcasti32x4 ym4, [bilin_h_shuf4] + vpbroadcastq ym0, [srcq+strideq*0] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 +.hv_w4_loop: + movq xmm1, [srcq+strideq*1] + movq xmm2, [srcq+strideq*2] + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 + punpcklqdq ym1, 
ym2 + pshufb ym1, ym4 + pmaddubsw ym1, ym5 ; 1 2 3 4 + valignq ym2, ym1, ym0, 3 ; 0 1 2 3 + mova ym0, ym1 + psubw ym1, ym2 + pmulhrsw ym1, ym6 + paddw ym1, ym2 + mova [tmpq], ym1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xmm1, [srcq+strideq*1] + vinserti128 ym1, ymm1, [srcq+strideq*2], 1 + vinserti128 m1, [srcq+stride3q ], 2 + lea srcq, [srcq+strideq*4] + vinserti128 m1, [srcq+strideq*0], 3 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 3 4 + valignq m2, m1, m0, 6 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mova m4, [bilin_h_perm16] + vbroadcasti32x8 m0, [srcq+strideq*0] + vpermb m0, m4, m0 + pmaddubsw m0, m5 +.hv_w16_loop: + movu ym1, [srcq+strideq*1] + vinserti32x8 m1, [srcq+strideq*2], 1 + movu ym2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti32x8 m2, [srcq+strideq*0], 1 + vpermb m1, m4, m1 + vpermb m2, m4, m2 + pmaddubsw m1, m5 ; 1 2 + vshufi32x4 m3, m0, m1, q1032 ; 0 1 + pmaddubsw m0, m2, m5 ; 3 4 + vshufi32x4 m2, m1, m0, q1032 ; 2 3 + psubw m1, m3 + pmulhrsw m1, m6 + paddw m1, m3 + psubw m3, m0, m2 + pmulhrsw m3, m6 + paddw m3, m2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m3 + add tmpq, 64*2 + sub hd, 4 + jg .hv_w16_loop + RET +.hv_w32: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+strideq*0] + pmaddubsw m0, m5 +.hv_w32_loop: + vpermb m1, m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermb m2, m4, [srcq+strideq*0] + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+64*0], m3 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 
+.hv_w64_loop: + add srcq, strideq + vpermb m2, m4, [srcq+32*0] + vpermb m3, m4, [srcq+32*1] + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m7, m2, m0 + psubw m8, m3, m1 + pmulhrsw m7, m6 + pmulhrsw m8, m6 + paddw m7, m0 + mova m0, m2 + paddw m8, m1 + mova m1, m3 + mova [tmpq+64*0], m7 + mova [tmpq+64*1], m8 + add tmpq, 64*2 + dec hd + jg .hv_w64_loop + RET +.hv_w128: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, strideq + vpermb m7, m4, [srcq+32*0] + vpermb m8, m4, [srcq+32*1] + vpermb m9, m4, [srcq+32*2] + vpermb m10, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m7, m8, m9, m10 + psubw m11, m7, m0 + psubw m12, m8, m1 + psubw m13, m9, m2 + psubw m14, m10, m3 + REPX {pmulhrsw x, m6}, m11, m12, m13, m14 + paddw m11, m0 + mova m0, m7 + paddw m12, m1 + mova m1, m8 + paddw m13, m2 + mova m2, m9 + paddw m14, m3 + mova m3, m10 + mova [tmpq+64*0], m11 + mova [tmpq+64*1], m12 + mova [tmpq+64*2], m13 + mova [tmpq+64*3], m14 + add tmpq, 64*4 + dec hd + jg .hv_w128_loop + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + +%macro PREP_8TAP_H 0 + vpermb m10, m5, m0 + vpermb m11, m5, m1 + vpermb m12, m6, m0 + vpermb m13, m6, m1 + vpermb m14, m7, m0 + vpermb m15, m7, m1 + mova m0, m4 + vpdpbusd m0, m10, m8 + mova m2, m4 + vpdpbusd m2, m12, m8 + mova m1, m4 + vpdpbusd m1, m11, m8 + mova m3, m4 + vpdpbusd m3, m13, m8 + vpdpbusd m0, m12, m9 + vpdpbusd m2, m14, m9 + vpdpbusd m1, m13, m9 + vpdpbusd m3, m15, m9 + packssdw m0, m2 + packssdw m1, m3 
+ psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, + +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pd_2] + WIN64_SPILL_XMM 10 + cmp wd, 4 + je .h_w4 + tzcnt wd, wd + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + add wq, r7 + jmp wq +.h_w4: + movzx mxd, mxb + vbroadcasti128 ym5, [subpel_h_shufA] + mov r3d, 0x4 + dec srcq + vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + kmovb k1, r3d + lea stride3q, [strideq*3] +.h_w4_loop: + movq xm2, [srcq+strideq*0] + movq xm3, [srcq+strideq*1] + vpbroadcastq ym2{k1}, [srcq+strideq*2] + vpbroadcastq ym3{k1}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pshufb ym2, ym5 + pshufb ym3, ym5 + mova ym0, ym4 + vpdpbusd ym0, ym2, ym6 + mova ym1, ym4 + vpdpbusd ym1, ym3, ym6 + packssdw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti128 m5, 
[subpel_h_shufA] + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + lea stride3q, [strideq*3] +.h_w8_loop: + movu xmm3, [srcq+strideq*0] + vinserti128 ym3, ymm3, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*2], 2 + vinserti128 m3, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m1, m3, m5 + pshufb m2, m3, m6 + mova m0, m4 + vpdpbusd m0, m1, m8 + mova m1, m4 + vpdpbusd m1, m2, m8 + pshufb m3, m7 + vpdpbusd m0, m2, m9 + vpdpbusd m1, m3, m9 + packssdw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m5, [spel_h_perm16a] + mova m6, [spel_h_perm16b] + mova m7, [spel_h_perm16c] + lea stride3q, [strideq*3] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*1], 1 + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] +.h_w32_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + xor r6d, r6d + jmp .h_start +.h_w128: + mov r6, -64*1 +.h_start: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] + sub srcq, r6 + mov r5, r6 +.h_loop: + movu m0, [srcq+r6+32*0] + movu m1, [srcq+r6+32*1] + PREP_8TAP_H + add tmpq, 64*2 + add r6, 64 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET +.v: + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + tzcnt wd, wd + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. + ; TODO: Would a 6-tap code path be worth it? 
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] + add wq, r7 + lea stride3q, [strideq*3] + sub srcq, stride3q + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + jmp wq +.v_w4: + movd xmm0, [srcq+strideq*0] + vpbroadcastd ymm1, [srcq+strideq*2] + vpbroadcastd xmm2, [srcq+strideq*1] + vpbroadcastd ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd ymm0, [srcq+strideq*0] + vpbroadcastd ymm2, [srcq+strideq*1] + vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd ymm0, [srcq+strideq*2] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 + vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw ymm2, ymm3 ; 23 34 45 56 +.v_w4_loop: + pinsrd xmm0, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpbroadcastd ymm3, [srcq+strideq*0] + vpbroadcastd ymm4, [srcq+strideq*1] + vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ + vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ + vpbroadcastd ymm0, [srcq+strideq*2] + vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb ymm3, ymm5 ; 67 78 89 9a + pmaddubsw ymm4, ymm1, ym8 + vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 + pmaddubsw ymm2, ym9 + paddw ymm4, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym11 + paddw ymm3, ymm4 + pmaddubsw ymm4, ymm1, ym10 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym7 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mov r3d, 0xf044 + kmovw k1, r3d + kshiftrw k2, k1, 8 + movq xm0, [srcq+strideq*0] + vpbroadcastq ym1, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, 
[srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpbroadcastq m6, [srcq+strideq*2] + vmovdqa64 ym0{k1}, ym1 + vmovdqa64 ym1{k1}, ym2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + punpcklbw ym0, ym1 ; 01 12 __ __ + punpcklbw m2, m3 ; 23 34 23 34 + punpcklbw m4, m5 ; 45 56 45 56 + vmovdqa64 m0{k2}, m2 ; 01 12 23 34 + vmovdqa64 m2{k2}, m4 ; 23 34 45 56 +.v_w8_loop: + vpbroadcastq m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + pmaddubsw m14, m0, m8 + pmaddubsw m15, m2, m9 + vpblendmq m0{k1}, m6, m1 + vpblendmq m2{k1}, m1, m3 + vpbroadcastq m6, [srcq+strideq*2] + paddw m14, m15 + punpcklbw m2, m0, m2 ; 67 78 67 78 + vpblendmq m12{k1}, m3, m5 + vpblendmq m13{k1}, m5, m6 + vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 + punpcklbw m4, m12, m13 ; 89 9a 89 9a + vmovdqa64 m2{k2}, m4 ; 67 78 89 9a + pmaddubsw m12, m0, m10 + pmaddubsw m13, m2, m11 + paddw m14, m12 + paddw m14, m13 + pmulhrsw m14, m7 + mova [tmpq], m14 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mov r3d, 0xf0 + kmovb k1, r3d + vbroadcasti128 m0, [srcq+strideq*0] + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + vmovdqa64 m0{k1}, m1 + vmovdqa64 m1{k1}, m2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b + shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b + shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- + shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- + punpckhbw m2, m0, m1 ; 23a 23b 34a 34b + punpcklbw m0, m1 ; 01a 01b 12a 12b + punpcklbw m4, m5 ; 45a 45b 56a 56b +.v_w16_loop: + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m5, [srcq+strideq*0] + 
vpblendmq m1{k1}, m6, m3 + vmovdqa64 m3{k1}, m5 + pmaddubsw m12, m0, m8 + pmaddubsw m13, m2, m8 + pmaddubsw m14, m2, m9 + pmaddubsw m15, m4, m9 + pmaddubsw m0, m4, m10 + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + paddw m12, m14 + paddw m13, m15 + paddw m12, m0 + vmovdqa64 m5{k1}, m2 + vmovdqa64 m2{k1}, m6 + mova m0, m4 + shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b + shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab + punpcklbw m2, m1, m3 ; 67a 67b 78a 78b + punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab + pmaddubsw m14, m2, m10 + pmaddubsw m15, m2, m11 + paddw m13, m14 + paddw m12, m15 + pmaddubsw m14, m4, m11 + paddw m13, m14 + pmulhrsw m12, m7 + pmulhrsw m13, m7 + mova [tmpq+ 0], m12 + mova [tmpq+64], m13 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m18, [bilin_v_perm64] + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + movu ym3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym4, [srcq+strideq*0] + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym6, [srcq+strideq*0] + vpermq m0, m18, m0 + vpermq m1, m18, m1 + vpermq m2, m18, m2 + vpermq m3, m18, m3 + vpermq m4, m18, m4 + vpermq m5, m18, m5 + vpermq m6, m18, m6 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + punpcklbw m3, m4 + punpcklbw m4, m5 + punpcklbw m5, m6 +.v_w32_loop: + movu ym12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym13, [srcq+strideq*0] + pmaddubsw m14, m0, m8 + pmaddubsw m16, m2, m9 + pmaddubsw m15, m1, m8 + pmaddubsw m17, m3, m9 + mova m0, m2 + mova m1, m3 + vpermq m12, m18, m12 + vpermq m13, m18, m13 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m4, m10 + pmaddubsw m17, m5, m10 + punpcklbw m6, m12 + punpcklbw m12, m13 + mova m2, m4 + mova m3, m5 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m6, m11 + pmaddubsw m17, m12, m11 + mova m4, m6 + mova m5, m12 + paddw m14, m16 + paddw m15, m17 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova 
m6, m13 + mova [tmpq+ 0], m14 + mova [tmpq+64], m15 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: + mov wd, 64 + jmp .v_start +.v_w128: + mov wd, 128 +.v_start: + WIN64_SPILL_XMM 27 + mova m26, [bilin_v_perm64] + lea r6d, [hq+wq*2] + mov r5, srcq + mov r7, tmpq +.v_loop0: + vpermq m0, m26, [srcq+strideq*0] + vpermq m1, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m2, m26, [srcq+strideq*0] + vpermq m3, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m4, m26, [srcq+strideq*0] + vpermq m5, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m6, m26, [srcq+strideq*0] + punpckhbw m12, m0, m1 + punpcklbw m0, m1 + punpckhbw m13, m1, m2 + punpcklbw m1, m2 + punpckhbw m14, m2, m3 + punpcklbw m2, m3 + punpckhbw m15, m3, m4 + punpcklbw m3, m4 + punpckhbw m16, m4, m5 + punpcklbw m4, m5 + punpckhbw m17, m5, m6 + punpcklbw m5, m6 +.v_loop: + vpermq m18, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m19, m26, [srcq+strideq*0] + pmaddubsw m20, m0, m8 + pmaddubsw m21, m12, m8 + pmaddubsw m22, m1, m8 + pmaddubsw m23, m13, m8 + mova m0, m2 + mova m12, m14 + mova m1, m3 + mova m13, m15 + pmaddubsw m2, m9 + pmaddubsw m14, m9 + pmaddubsw m3, m9 + pmaddubsw m15, m9 + punpckhbw m24, m6, m18 + punpcklbw m6, m18 + paddw m20, m2 + paddw m21, m14 + paddw m22, m3 + paddw m23, m15 + mova m2, m4 + mova m14, m16 + mova m3, m5 + mova m15, m17 + pmaddubsw m4, m10 + pmaddubsw m16, m10 + pmaddubsw m5, m10 + pmaddubsw m17, m10 + punpckhbw m25, m18, m19 + punpcklbw m18, m19 + paddw m20, m4 + paddw m21, m16 + paddw m22, m5 + paddw m23, m17 + mova m4, m6 + mova m16, m24 + mova m5, m18 + mova m17, m25 + pmaddubsw m6, m11 + pmaddubsw m24, m11 + pmaddubsw m18, m11 + pmaddubsw m25, m11 + paddw m20, m6 + paddw m21, m24 + paddw m22, m18 + paddw m23, m25 + pmulhrsw m20, m7 + pmulhrsw m21, m7 + pmulhrsw m22, m7 + pmulhrsw m23, m7 + mova m6, m19 + mova [tmpq+wq*0+ 0], m20 + mova [tmpq+wq*0+64], m21 + mova [tmpq+wq*2+ 0], m22 + mova 
[tmpq+wq*2+64], m23 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_loop + add r5, 64 + add r7, 128 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .v_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + WIN64_SPILL_XMM 16 + cmp wd, 4 + je .hv_w4 + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + tzcnt wd, wd + vpbroadcastd m8, [pd_2] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] + vpbroadcastd m9, [pd_32] + add wq, r7 + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + jmp wq +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + mov r3d, 0x04 + kmovb k1, r3d + kshiftlb k2, k1, 2 + kshiftlb k3, k1, 4 + vpbroadcastd m10, [pd_2] + vbroadcasti128 m16, [subpel_h_shufA] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m11, [pd_32] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + movq xm3, [srcq+strideq*0] + vpbroadcastq ym2, [srcq+strideq*1] + vpbroadcastq ym3{k1}, [srcq+strideq*2] + vpbroadcastq m2{k2}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3{k2}, [srcq+strideq*0] + vpbroadcastq m2{k3}, [srcq+strideq*1] + vpbroadcastq m3{k3}, [srcq+strideq*2] + mova m17, [spel_hv_perm4a] + movu m18, [spel_hv_perm4b] + mova m0, m10 + mova m1, m10 + pshufb m2, m16 + pshufb m3, m16 + vpdpbusd m0, m2, m8 + vpdpbusd m1, m3, m8 + packssdw m0, m1 ; _ 0 1 2 3 4 
5 6 + psraw m0, 2 + vpermb m1, m17, m0 ; 01 12 23 34 + vpermb m2, m18, m0 ; 23 34 45 56 +.hv_w4_loop: + movq xm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movq xm4, [srcq+strideq*0] + vpbroadcastq ym3{k1}, [srcq+strideq*1] + vpbroadcastq ym4{k1}, [srcq+strideq*2] + mova ym5, ym10 + mova ym6, ym10 + pshufb ym3, ym16 + pshufb ym4, ym16 + vpdpbusd ym5, ym3, ym8 + vpdpbusd ym6, ym4, ym8 + mova m7, m11 + packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ + psraw ym5, 2 + valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a + vpdpwssd m7, m1, m12 + vpdpwssd m7, m2, m13 + vpermb m1, m17, m0 ; 45 56 67 78 + vpermb m2, m18, m0 ; 67 78 89 9a + vpdpwssd m7, m1, m14 + vpdpwssd m7, m2, m15 + psrad m7, 6 + vpmovdw [tmpq], m7 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + WIN64_SPILL_XMM 24 + vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m17, [subpel_h_shufB] + vbroadcasti128 m18, [subpel_h_shufC] + vinserti128 ym0, [srcq+strideq*0], 1 + vinserti128 m0, [srcq+strideq*1], 2 + vinserti128 m0, [srcq+strideq*2], 3 + movu xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 ym1, [srcq+strideq*0], 1 + vinserti128 m1, [srcq+strideq*1], 2 + vinserti128 m1, [srcq+strideq*2], 3 + mova m2, m8 + mova m4, m8 + mova m3, m8 + mova m5, m8 + pshufb m20, m0, m16 + pshufb m21, m0, m17 + pshufb m22, m0, m18 + pshufb m23, m1, m16 + pshufb m6, m1, m17 + pshufb m7, m1, m18 + vpdpbusd m2, m20, m10 + vpdpbusd m4, m21, m10 + vpdpbusd m2, m21, m11 + vpdpbusd m4, m22, m11 + vpdpbusd m3, m23, m10 + vpdpbusd m5, m6, m10 + vpdpbusd m3, m6, m11 + vpdpbusd m5, m7, m11 + packssdw m2, m4 + packssdw m3, m5 + psraw m2, 2 ; _ 0 1 2 + psraw m3, 2 ; 3 4 5 6 + valignq m0, m3, m2, 2 ; 0 1 2 3 + valignq m1, m3, m2, 4 ; 1 2 3 4 + valignq m2, m3, m2, 6 ; 2 3 4 5 + punpcklwd m4, m0, m1 ; 01a 12a 23a 34a + punpckhwd m5, m0, m1 ; 01b 12b 23b 34b + punpcklwd m6, m2, m3 ; 23a 34a 45a 56a + punpckhwd m7, m2, m3 ; 23b 34b 45b 56b +.hv_w8_loop: + movu xm19, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + 
vinserti128 ym19, [srcq+strideq*0], 1 + vinserti128 m19, [srcq+strideq*1], 2 + vinserti128 m19, [srcq+strideq*2], 3 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m4, m12 + vpdpwssd m21, m5, m12 + vpdpwssd m20, m6, m13 + vpdpwssd m21, m7, m13 + pshufb m0, m19, m16 + pshufb m1, m19, m17 + pshufb m2, m19, m18 + vpdpbusd m22, m0, m10 + vpdpbusd m23, m1, m10 + vpdpbusd m22, m1, m11 + vpdpbusd m23, m2, m11 + packssdw m22, m23 + psraw m22, 2 ; 7 8 9 A + valignq m0, m22, m3, 2 ; 4 5 6 7 + valignq m1, m22, m3, 4 ; 5 6 7 8 + valignq m2, m22, m3, 6 ; 6 7 8 9 + mova m3, m22 + punpcklwd m4, m0, m1 ; 45a 56a 67a 78a + punpckhwd m5, m0, m1 ; 45b 56b 67b 78b + punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa + punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq], m20 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mov wd, 16*2 + jmp .hv_start +.hv_w32: + mov wd, 32*2 + jmp .hv_start +.hv_w64: + mov wd, 64*2 + jmp .hv_start +.hv_w128: + mov wd, 128*2 +.hv_start: + WIN64_SPILL_XMM 31 + mova m16, [spel_h_perm16a] + mova m17, [spel_h_perm16b] + mova m18, [spel_h_perm16c] + lea r6d, [hq+wq*8-256] + mov r5, srcq + mov r7, tmpq +.hv_loop0: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym1, [srcq+strideq*0] + vinserti32x8 m1, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + vinserti32x8 m2, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym3, [srcq+strideq*0] + mova m4, m8 + mova m5, m8 + mova m6, m8 + mova m7, m8 + vpermb m19, m16, m0 + vpermb m20, m17, m0 + vpermb m21, m18, m0 + vpermb m22, m16, m1 + vpermb m23, m17, m1 + vpermb m24, m18, m1 + vpermb m25, m16, m2 + vpermb m26, m17, m2 + vpermb m27, m18, m2 + vpermb ym28, ym16, ym3 + vpermb ym29, ym17, ym3 + vpermb ym30, ym18, ym3 + mova m0, m8 + mova m1, m8 + mova 
ym2, ym8 + mova ym3, ym8 + vpdpbusd m4, m19, m10 + vpdpbusd m5, m20, m10 + vpdpbusd m6, m22, m10 + vpdpbusd m7, m23, m10 + vpdpbusd m0, m25, m10 + vpdpbusd m1, m26, m10 + vpdpbusd ym2, ym28, ym10 + vpdpbusd ym3, ym29, ym10 + vpdpbusd m4, m20, m11 + vpdpbusd m5, m21, m11 + vpdpbusd m6, m23, m11 + vpdpbusd m7, m24, m11 + vpdpbusd m0, m26, m11 + vpdpbusd m1, m27, m11 + vpdpbusd ym2, ym29, ym11 + vpdpbusd ym3, ym30, ym11 + packssdw m4, m5 + packssdw m6, m7 + packssdw m0, m1 + packssdw ym2, ym3 + psraw m4, 2 ; 0a 0b 1a 1b + psraw m6, 2 ; 2a 2b 3a 3b + psraw m0, 2 ; 4a 4b 5a 5b + psraw ym2, 2 ; 6a 6b __ __ + vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b + vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b + vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b + punpcklwd m2, m4, m5 ; 01a 01c 12a 12c + punpckhwd m3, m4, m5 ; 01b 01d 12b 12d + punpcklwd m4, m6, m7 ; 23a 23c 34a 34c + punpckhwd m5, m6, m7 ; 23b 23d 34b 34d + punpcklwd m6, m0, m1 ; 45a 45c 56a 56c + punpckhwd m7, m0, m1 ; 45b 45d 56b 56d +.hv_loop: + movu ym19, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m19, [srcq+strideq*0], 1 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m2, m12 + vpdpwssd m21, m3, m12 + vpdpwssd m20, m4, m13 + vpdpwssd m21, m5, m13 + vpermb m24, m16, m19 + vpermb m25, m17, m19 + vpermb m26, m18, m19 + vpdpbusd m22, m24, m10 + vpdpbusd m23, m25, m10 + vpdpbusd m22, m25, m11 + vpdpbusd m23, m26, m11 + packssdw m22, m23 + psraw m22, 2 ; 7a 7b 8a 8b + vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + mova m2, m4 + mova m3, m5 + mova m1, m22 + mova m4, m6 + mova m5, m7 + punpcklwd m6, m0, m1 ; 67a 67c 78a 78c + punpckhwd m7, m0, m1 ; 67b 67d 78b 78d + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq+wq*0], ym20 + vextracti32x8 [tmpq+wq*1], m20, 1 + lea tmpq, [tmpq+wq*2] + sub hd, 2 + jg .hv_loop + add r5, 16 + add r7, 32 + movzx hd, r6b + mov srcq, r5 + mov tmpq, 
r7 + sub r6d, 1<<8 + jg .hv_loop0 + RET + +%macro BIDIR_FN 1 ; op + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM %1 0 + vextracti32x4 xmm1, ym0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_ret + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_ret: + RET +.w4_h16: + vpbroadcastd m7, strided + pmulld m7, [bidir_sctr_w4] + %1 0 + kxnorw k1, k1, k1 + vpscatterdd [dstq+m7]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM %1 0 + vextracti128 xmm1, ym0, 1 + movq [dstq ], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w8_h8: + %1 0 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq ], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq ], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w16: + %1 0 + vpermq m0, m0, q3120 + mova [dstq ], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m7, [pb_02461357] +.w32_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m7, [pb_02461357] +.w64_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m7, [pb_02461357] +.w128_loop: + %1 0 + vpermq m6, m7, m0 + %1 2 + mova 
[dstq+64*0], m6 + %1_INC_PTR 4 + vpermq m6, m7, m0 + mova [dstq+64*1], m6 + add dstq, strideq + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + mova m0, [tmp1q+(%1+0)*mmsize] + paddw m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + paddw m1, [tmp2q+(%1+1)*mmsize] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m2, [base+pw_1024] + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m0, [tmp1q+(%1+0)*mmsize] + psubw m2, m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + psubw m3, m1, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] + vpbroadcastd m5, [base+pw_2048] + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + pxor m0, m0 + mov tmp1q, tmp2q + psubw m4, m0, m4 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 +%if mmsize == 64 + vpermq m3, m8, [maskq+%1*32] +%else + vpermq m3, 
[maskq+%1*16], q3120 +%endif + mova m0, [tmp2q+(%1+0)*mmsize] + psubw m1, m0, [tmp1q+(%1+0)*mmsize] + psubb m3, m4, m3 + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 + punpcklbw m2, m4, m3 ; -m << 9 + pmulhw m1, m2 + paddw m0, m1 + mova m1, [tmp2q+(%1+1)*mmsize] + psubw m2, m1, [tmp1q+(%1+1)*mmsize] + paddw m2, m2 + punpckhbw m3, m4, m3 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*32 + add tmp2q, %1*64 + add tmp1q, %1*64 +%endmacro + +cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + pxor m4, m4 + mova m8, [base+bilin_v_perm64] + vpbroadcastd m5, [base+pw_2048] + add wq, r7 + BIDIR_FN MASK + +%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 + mova m%1, [tmp1q+mmsize*%3] + mova m1, [tmp2q+mmsize*%3] + psubw m1, m%1 + pabsw m%2, m1 + psubusw m%2, m6, m%2 + psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m%1, m1 + mova m1, [tmp1q+mmsize*%4] + mova m2, [tmp2q+mmsize*%4] + psubw m2, m1 + pabsw m3, m2 + psubusw m3, m6, m3 + vpshldw m%2, m3, 8 + psllw m3, m%2, 10 +%if %5 + psubb m%2, m5, m%2 +%endif + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m%1, m7 + pmulhrsw m1, m7 + packuswb m%1, m1 +%endmacro + +cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pb_m64] ; -1 << 6 + mova ym10, [base+wm_420_mask+32] + vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m5, [wm_420_perm4] + cmp hd, 8 + jg .w4_h16 + 
WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm4+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + vpermb ym8, ym10, ym8 + movq [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m11, strided + pmulld m11, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + vpdpbusd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + mova [maskq], xm8 + vpscatterdd [dstq+m11]{k1}, m0 + RET +.w8: + mova m5, [wm_420_perm8] + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm8+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vpermb m8, m10, m8 + mova [maskq], xm8 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 16 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + vpermb m1, m10, m1 + mova [maskq], xm1 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16: + mova m5, [wm_420_perm16] +.w16_loop: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], xm0 + vextracti32x4 
[dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 + psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 +.w64_loop: + W_MASK 0, 4, 0, 2 + W_MASK 11, 5, 1, 3 + mova m2, m8 + vpdpbusd m2, m4, m9 + mova m3, m8 + vpdpbusd m3, m5, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermt2b m2, m10, m3 + mova m1, m0 + vpermt2q m0, m12, m11 + vpermt2q m1, m13, m11 + mova [maskq], ym2 + add maskq, 32 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64_loop + RET +.w128: + pmovzxbq m14, [wm_420_perm64] + mova m10, [wm_420_mask] + psrlq m15, m14, 4 +.w128_loop: + W_MASK 0, 12, 0, 4 + W_MASK 11, 13, 1, 5 + mova m4, m8 + vpdpbusd m4, m12, m9 + mova m5, m8 + vpdpbusd m5, m13, m9 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*1+64*0], m1 + W_MASK 0, 12, 2, 6 + W_MASK 11, 13, 3, 7 + vprold m4, 16 + vprold m5, 16 + vpdpbusd m4, m12, m9 + vpdpbusd m5, m13, m9 + add tmp1q, 512 + add tmp2q, 512 + vpermt2b m4, m10, m5 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0+64*1], m0 + mova [dstq+strideq*1+64*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w128_loop + RET + +cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd 
wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pw_m128] + mova m10, [base+wm_422_mask] + vpbroadcastd m11, [base+pb_127] + add wq, r7 + vpbroadcastd m8, [base+wm_sign+4+r6*4] + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + pand xm8, xm11 + mova [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m5, strided + pmulld m5, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpdpwssd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + pand ym8, ym11 + mova [maskq], ym8 + vpscatterdd [dstq+m5]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + pand xm8, xm11 + mova [maskq], xm8 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + pand ym1, ym11 + mova [maskq], ym1 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 
+ jg .w8_loop + RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + pand ym1, ym11 + mova [maskq], ym1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m5, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m13, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1 + W_MASK 12, 5, 2, 3 + mova m2, m8 + vpdpwssd m2, m4, m9 + mova m3, m8 + vpdpwssd m3, m5, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermt2b m2, m10, m3 + vpermq m0, m13, m0 + vpermq m1, m13, m12 + pand m2, m11 + mova [maskq], m2 + add maskq, 64 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w128_loop + RET + +cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + mova m8, [base+wm_444_mask] + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + 
WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + RET +.w4_h16: + vpbroadcastd m9, strided + pmulld m9, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + kxnorw k1, k1, k1 + mova [maskq], m4 + vpscatterdd [dstq+m9]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + mova [maskq], m4 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + vpermq m0, m0, q3120 + mova [maskq], m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m9, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + 
vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m9, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m11, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1, 1 + W_MASK 10, 9, 2, 3, 1 + vpermb m4, m8, m4 + vpermb m9, m8, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermq m0, m11, m0 + vpermq m10, m11, m10 + mova [maskq+64*0], m4 + mova [maskq+64*1], m9 + add maskq, 128 + mova [dstq+64*0], m0 + mova [dstq+64*1], m10 + add dstq, strideq + dec hd + jg .w128_loop + RET + +%endif ; HAVE_AVX512ICL && ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc_init_tmpl.c b/third_party/dav1d/src/x86/mc_init_tmpl.c new file mode 100644 index 0000000000..47f0104a9d --- /dev/null +++ b/third_party/dav1d/src/x86/mc_init_tmpl.c @@ -0,0 +1,366 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/mc.h" + +decl_mc_fn(dav1d_put_8tap_regular_avx2); +decl_mc_fn(dav1d_put_8tap_regular_ssse3); +decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2); +decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3); +decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2); +decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3); +decl_mc_fn(dav1d_put_8tap_smooth_avx2); +decl_mc_fn(dav1d_put_8tap_smooth_ssse3); +decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2); +decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3); +decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2); +decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3); +decl_mc_fn(dav1d_put_8tap_sharp_avx2); +decl_mc_fn(dav1d_put_8tap_sharp_ssse3); +decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2); +decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3); +decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2); +decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3); +decl_mc_fn(dav1d_put_bilin_avx2); +decl_mc_fn(dav1d_put_bilin_ssse3); + +decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); +decl_mct_fn(dav1d_prep_8tap_regular_avx2); +decl_mct_fn(dav1d_prep_8tap_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sse2); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); 
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2); +decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl); +decl_mct_fn(dav1d_prep_8tap_smooth_avx2); +decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sse2); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2); +decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl); +decl_mct_fn(dav1d_prep_8tap_sharp_avx2); +decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_sse2); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2); +decl_mct_fn(dav1d_prep_bilin_avx512icl); +decl_mct_fn(dav1d_prep_bilin_avx2); +decl_mct_fn(dav1d_prep_bilin_ssse3); +decl_mct_fn(dav1d_prep_bilin_sse2); + +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); 
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3); +decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2); +decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3); + +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3); +decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2); +decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3); + +decl_avg_fn(dav1d_avg_avx512icl); +decl_avg_fn(dav1d_avg_avx2); +decl_avg_fn(dav1d_avg_ssse3); 
+decl_w_avg_fn(dav1d_w_avg_avx512icl); +decl_w_avg_fn(dav1d_w_avg_avx2); +decl_w_avg_fn(dav1d_w_avg_ssse3); +decl_mask_fn(dav1d_mask_avx512icl); +decl_mask_fn(dav1d_mask_avx2); +decl_mask_fn(dav1d_mask_ssse3); +decl_w_mask_fn(dav1d_w_mask_420_avx512icl); +decl_w_mask_fn(dav1d_w_mask_420_avx2); +decl_w_mask_fn(dav1d_w_mask_420_ssse3); +decl_w_mask_fn(dav1d_w_mask_422_avx512icl); +decl_w_mask_fn(dav1d_w_mask_422_avx2); +decl_w_mask_fn(dav1d_w_mask_444_avx512icl); +decl_w_mask_fn(dav1d_w_mask_444_avx2); +decl_blend_fn(dav1d_blend_avx2); +decl_blend_fn(dav1d_blend_ssse3); +decl_blend_dir_fn(dav1d_blend_v_avx2); +decl_blend_dir_fn(dav1d_blend_v_ssse3); +decl_blend_dir_fn(dav1d_blend_h_avx2); +decl_blend_dir_fn(dav1d_blend_h_ssse3); + +decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2); +decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4); +decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3); +decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2); +decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2); +decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4); +decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3); +decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2); + +decl_emu_edge_fn(dav1d_emu_edge_avx2); +decl_emu_edge_fn(dav1d_emu_edge_ssse3); + +decl_resize_fn(dav1d_resize_avx2); +decl_resize_fn(dav1d_resize_ssse3); + +COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = dav1d_put_##name##_##suffix +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = dav1d_prep_##name##_##suffix +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = dav1d_put_##name##_##suffix +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = dav1d_prep_##name##_##suffix + + const unsigned flags = dav1d_get_cpu_flags(); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) + return; + +#if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + 
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + + c->warp8x8 = dav1d_warp_affine_8x8_sse2; + c->warp8x8t = dav1d_warp_affine_8x8t_sse2; +#endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) + return; + +#if BITDEPTH == 8 + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP, 
8tap_sharp, ssse3); + +#if ARCH_X86_64 + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); +#endif + + c->avg = dav1d_avg_ssse3; + c->w_avg = dav1d_w_avg_ssse3; + c->mask = dav1d_mask_ssse3; + c->w_mask[2] = dav1d_w_mask_420_ssse3; + c->blend = dav1d_blend_ssse3; + c->blend_v = dav1d_blend_v_ssse3; + c->blend_h = dav1d_blend_h_ssse3; + + c->warp8x8 = dav1d_warp_affine_8x8_ssse3; + c->warp8x8t = dav1d_warp_affine_8x8t_ssse3; + + c->emu_edge = dav1d_emu_edge_ssse3; + 
c->resize = dav1d_resize_ssse3; +#endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) + return; + +#if BITDEPTH == 8 + c->warp8x8 = dav1d_warp_affine_8x8_sse4; + c->warp8x8t = dav1d_warp_affine_8x8t_sse4; +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) + return; + +#if BITDEPTH == 8 + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + 
init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + c->avg = dav1d_avg_avx2; + c->w_avg = dav1d_w_avg_avx2; + c->mask = dav1d_mask_avx2; + c->w_mask[0] = dav1d_w_mask_444_avx2; + c->w_mask[1] = dav1d_w_mask_422_avx2; + c->w_mask[2] = dav1d_w_mask_420_avx2; + c->blend = dav1d_blend_avx2; + c->blend_v = dav1d_blend_v_avx2; + c->blend_h = dav1d_blend_h_avx2; + + c->warp8x8 = dav1d_warp_affine_8x8_avx2; + c->warp8x8t = dav1d_warp_affine_8x8t_avx2; + + c->emu_edge = dav1d_emu_edge_avx2; + c->resize = dav1d_resize_avx2; +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + +#if HAVE_AVX512ICL && BITDEPTH == 8 + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + 
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl); + + c->avg = dav1d_avg_avx512icl; + c->w_avg = dav1d_w_avg_avx512icl; + c->mask = dav1d_mask_avx512icl; + c->w_mask[0] = dav1d_w_mask_444_avx512icl; + c->w_mask[1] = dav1d_w_mask_422_avx512icl; + c->w_mask[2] = dav1d_w_mask_420_avx512icl; +#endif +#endif +} diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm new file mode 100644 index 0000000000..edbd186564 --- /dev/null +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -0,0 +1,7544 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2018, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+; Read-only data for the SSE2/SSSE3 motion-compensation kernels:
+; byte-shuffle masks, broadcast constants and per-width jump tables.
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
+; Every byte pair is (m, 64 - m). A "; N @off" marker gives the byte offset
+; (off) at which the group of pairs for dimension N starts.
+obmc_masks:     db  0,  0,  0,  0
+                ; 2 @4
+                db 45, 19, 64,  0
+                ; 4 @8
+                db 39, 25, 50, 14, 59,  5, 64,  0
+                ; 8 @16
+                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
+                ; 16 @32
+                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
+                ; 32 @64
+                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
+                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
+
+; 16-byte pshufb control masks (one byte index per output lane).
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+blend_shuf:     db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+                db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+; The bilin masks pair every byte with its right-hand neighbour (x, x+1) so
+; that a following pmaddubsw can weight the two taps of the bilinear filter.
+bilin_h_shuf4:  db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+unpckw:         db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+pb_8x0_8x8:  times 8 db 0
+             times 8 db 8
+bdct_lb_dw:  times 4 db 0
+             times 4 db 4
+             times 4 db 8
+             times 4 db 12
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: times 5 db 0
+             db 1, 2, 3, 4, 5, 6
+             times 5+16 db 7
+
+; Broadcast constants. Naming: p<b|w|d|q>_<value> = packed bytes / words /
+; dwords / qwords, each element holding exactly <value>.
+pb_64:     times 16 db 64
+pw_m256:   times 8 dw -256
+pw_1:      times 8 dw 1
+pw_2:      times 8 dw 2
+pw_8:      times 8 dw 8
+pw_15:     times 8 dw 15
+pw_26:     times 8 dw 26
+pw_34:     times 8 dw 34
+pw_512:    times 8 dw 512
+pw_1024:   times 8 dw 1024
+pw_2048:   times 8 dw 2048
+pw_6903:   times 8 dw 6903
+pw_8192:   times 8 dw 8192
+pd_32:     times 4 dd 32
+pd_63:     times 4 dd 63
+pd_512:    times 4 dd 512
+; 1 << 14. The stored value has to match the label; it previously read 16484.
+; NOTE(review): the users of this constant are outside this part of the file;
+; presumably it is a rounding bias, so confirm the affected kernel's output
+; against the C reference after this correction.
+pd_16384:  times 4 dd 16384
+pd_32768:  times 4 dd 32768
+pd_262144: times 4 dd 262144
+pd_0x3ff:  times 4 dd 0x3ff
+pd_0x4000: times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
+
+pw_258:    times 2 dw 258
+
+cextern mc_subpel_filters
+; Filter table address biased by -8 bytes (one 8-byte filter row).
+; NOTE(review): presumably because filter index 0 is never looked up - confirm
+; against the `[base_reg+mxq*8+subpel_filters-put_ssse3]` style accesses.
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+; BIDIR_JMP_TABLE fn, w0, w1, ...
+; Emits one 32-bit entry per width argument holding `fn.w<width> - fn_table`,
+; and defines `fn_table` as the table start minus 2*w0 bytes. For w0 = 2 or 4
+; that bias equals tzcnt(w0)*4, i.e. the table is meant to be indexed with
+; tzcnt(w)*4 (the consumers are outside this part of the file - confirm).
+; Repeating a width (e.g. "16, 16, 16") makes several slots share one label.
+%macro BIDIR_JMP_TABLE 1-*
+    ;evaluated at definition time (in loop below)
+    %xdefine %1_table (%%table - 2*%2)
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    ; dynamically generated label
+    %%table:
+    %rep %0 - 1 ; repeat for num args
+        dd %%prefix %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg_ssse3,        4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
+BIDIR_JMP_TABLE blend_ssse3,      4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
+
+; BASE_JMP_TABLE type, isa, w0, w1, ...
+; Emits one 16-bit entry per width holding `<type>_<isa>_w<width> - <type>_<isa>`
+; and defines `<type>_<isa>_table` as the table start minus w0 bytes. For
+; w0 = 2 or 4 that bias equals tzcnt(w0)*2, matching the
+; `[t0+wq*2+table_offset(put,)]` lookup done after `tzcnt wd, wm`.
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+; Base labels all put/prep jump-table offsets are measured from: the plain
+; copy entry points (.put / .prep) inside the bilinear functions.
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
+
+BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
+
+; HV_JMP_TABLE type, filter, isa, types, w0, w1, ...
+; `types` is a bitmask selecting which tables to emit:
+;   bit 0 -> .h_w<width>, bit 1 -> .v_w<width>, bit 2 -> .hv_w<width>.
+; Each table holds 16-bit offsets relative to <type>_<isa> and is biased by
+; w0 bytes like BASE_JMP_TABLE. Every %rep rotates the argument list by the
+; number of widths (%0 - 4); the following `%rotate 4` completes the cycle so
+; that %5 is the first width again for the next table.
+%macro HV_JMP_TABLE 5-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table (%%h - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table (%%v - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
+HV_JMP_TABLE prep,  8tap,  sse2, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin,  sse2, 7,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
+
+; SCALED_JMP_TABLE fn, w0, w1, ...
+; Emits three consecutive tables of 16-bit offsets relative to the function
+; entry: `fn_table` (.w<width>), `fn_dy1_table` (.dy1_w<width>) and
+; `fn_dy2_table` (.dy2_w<width>), each biased by w0 bytes. The `%rotate 1`
+; after each %rep skips the function-name argument so that %2 is the first
+; width again.
+%macro SCALED_JMP_TABLE 1-*
+    %xdefine %1_table (%%table - %2)
+    %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+    %rep %0 - 1
+        dw %%base %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_1024:
+    %xdefine %1_dy1_table (%%dy_1024 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy1_w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_2048:
+    %xdefine %1_dy2_table (%%dy_2048 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy2_w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%if ARCH_X86_64
+SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_ssse3,   4, 8, 16, 32, 64, 128
+%endif
+
+; Byte distance from a function's base label (<type><SUFFIX>, e.g. put_ssse3)
+; to one of its jump tables, so `base_reg + table_offset(...)` addresses it.
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_warp_filter
+
+SECTION .text
+
+INIT_XMM ssse3 + +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + %define base t0-put_ssse3 +%else + DECLARE_REG_TMP 7 + %define base 0 +%endif + +%macro RESTORE_DSQ_32 1 + %if ARCH_X86_32 + mov %1, dsm ; restore dsq + %endif +%endmacro + +cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn srcq, srcmp + movifnidn ssq, ssmp + tzcnt wd, wm + mov hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [t0+wq*2+table_offset(put,)] + add wq, t0 + RESTORE_DSQ_32 t0 + jmp wq +.put_w2: + movzx r4d, word [srcq+ssq*0] + movzx r6d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4w + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, 
[srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add srcq, ssq + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 0x00ff00ff + mova m4, [base+bilin_h_shuf8] + mova m0, [base+bilin_h_shuf4] + add mxyd, 0x00100010 + movd m5, mxyd + mov mxyd, r7m ; my + pshufd m5, m5, q0000 + test mxyd, mxyd + jnz .hv + movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] + mova m3, [base+pw_2048] + add wq, t0 + movifnidn dsq, dsmp + jmp wq +.h_w2: + pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} +.h_w2_loop: + movd m0, [srcq+ssq*0] + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq m0, m1 + pshufb m0, m4 + pmaddubsw m0, m5 + pmulhrsw m0, m3 + packuswb m0, m0 + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movq m4, [srcq+ssq*0] + movhps m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m4, m0 + pmaddubsw m4, m5 + pmulhrsw m4, m3 + packuswb m4, m4 + movd [dstq+dsq*0], m4 + psrlq m4, 32 + movd [dstq+dsq*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + 
dec hd + jg .h_w16 + RET +.h_w32: + movu m0, [srcq+mmsize*0+8*0] + movu m1, [srcq+mmsize*0+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movu m1, [srcq+mmsize*1+8*0] + movu m2, [srcq+mmsize*1+8*1] + add srcq, ssq + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + packuswb m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: + mov r6, -16*3 +.h_w64_loop: + movu m0, [srcq+r6+16*3+8*0] + movu m1, [srcq+r6+16*3+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*3], m0 + add r6, 16 + jle .h_w64_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + mov r6, -16*7 +.h_w128_loop: + movu m0, [srcq+r6+16*7+8*0] + movu m1, [srcq+r6+16*7+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*7], m0 + add r6, 16 + jle .h_w128_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 0x00ff00ff + mova m5, [base+pw_2048] + add mxyd, 0x00100010 + add wq, t0 + movd m4, mxyd + pshufd m4, m4, q0000 + movifnidn dsq, dsmp + jmp wq +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + pinsrw m0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pshuflw m1, m0, q2301 + pinsrw m0, [srcq+ssq*0], 0 ; 2 1 + punpcklbw m1, m0 + pmaddubsw m1, m4 + pmulhrsw m1, m5 + packuswb m1, m1 + movd r6d, m1 + mov [dstq+dsq*1], r6w + shr r6d, 16 + mov [dstq+dsq*0], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd m0, [srcq+ssq*0] +.v_w4_loop: + movd m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + movd m0, [srcq+ssq*0] + punpckldq m1, m2 ; 0 1 + punpckldq m2, m0 ; 1 2 + punpcklbw m1, m2 + pmaddubsw m1, 
m4 + pmulhrsw m1, m5 + packuswb m1, m1 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + ; + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq m0, [srcq+ssq*0] +.v_w8_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 + punpcklbw m2, m0 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +%macro PUT_BILIN_V_W16 0 + movu m0, [srcq+ssq*0] +%%loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + mova m2, m0 + movu m0, [srcq+ssq*0] + punpcklbw m1, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +%endmacro +.v_w16: + PUT_BILIN_V_W16 + RET +.v_w128: + lea r6d, [hq+(7<<16)] + jmp .v_w16gt +.v_w64: + lea r6d, [hq+(3<<16)] + jmp .v_w16gt +.v_w32: + lea r6d, [hq+(1<<16)] +.v_w16gt: + mov r4, srcq +%if ARCH_X86_64 + mov r7, dstq +%endif +.v_w16gt_loop: + PUT_BILIN_V_W16 +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%else + mov dstq, dstmp + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstmp, dstq +%endif + sub r6d, 1<<16 + jg .v_w16gt + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + mova m7, [base+pw_15] + movd m6, mxyd + add wq, t0 + pshuflw m6, m6, q0000 + paddb m5, m5 + punpcklqdq m6, m6 + jmp wq +.hv_w2: + RESTORE_DSQ_32 t0 + 
movd m0, [srcq+ssq*0] + punpckldq m0, m0 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + punpckldq m1, m2 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 _ 2 _ + shufps m2, m0, m1, q1032 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) + pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 + pavgw m2, m7 ; src[x] + 8 + paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 + psrlw m1, 4 + packuswb m1, m1 +%if ARCH_X86_64 + movq r6, m1 +%else + pshuflw m1, m1, q2020 + movd r6d, m1 +%endif + mov [dstq+dsq*0], r6w + shr r6, gprsize*4 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m4, [base+bilin_h_shuf4] + movddup xm0, [srcq+ssq*0] + movifnidn dsq, dsmp + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + shufps m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhw m1, m6 + pavgw m2, m7 + paddw m1, m2 + psrlw m1, 4 + packuswb m1, m1 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+ssq*0] + movifnidn dsq, dsmp + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m2, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + pmulhw m1, m6 + pavgw m0, m7 + paddw m1, m0 + movu m0, [srcq+ssq*0] + pshufb m0, m4 + pmaddubsw m0, m5 + psubw m3, m0, m2 + pmulhw m3, m6 + pavgw m2, m7 + paddw m3, m2 + psrlw m1, 4 + psrlw m3, 4 + packuswb m1, m3 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w128: + lea r6d, [hq+(7<<16)] + jmp .hv_w16_start +.hv_w64: + lea r6d, [hq+(3<<16)] + jmp .hv_w16_start +.hv_w32: + lea r6d, [hq+(1<<16)] +.hv_w16_start: + mov r4, srcq +%if ARCH_X86_32 + %define m8 
[dstq] +%else + mov r7, dstq +%endif +.hv_w16: + movifnidn dsq, dsmp +%if WIN64 + movaps r4m, m8 +%endif +.hv_w16_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w16_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova m8, m2 + psubw m2, m0 + pmulhw m2, m6 + pavgw m0, m7 + paddw m2, m0 + mova m0, m3 + psubw m3, m1 + pmulhw m3, m6 + pavgw m1, m7 + paddw m3, m1 + mova m1, m0 + mova m0, m8 + psrlw m2, 4 + psrlw m3, 4 + packuswb m2, m3 + mova [dstq], m2 + add dstq, dsmp + dec hd + jg .hv_w16_loop +%if ARCH_X86_32 + mov dstq, dstm + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstm, dstq +%else + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%endif + sub r6d, 1<<16 + jg .hv_w16_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET + +%macro PSHUFB_BILIN_H8 2 ; dst, src + %if cpuflag(ssse3) + pshufb %1, %2 + %else + psrldq %2, %1, 1 + punpcklbw %1, %2 + %endif +%endmacro + +%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp + %if cpuflag(ssse3) + pshufb %1, %2 + %else + psrldq %2, %1, 1 + punpckhbw %3, %1, %2 + punpcklbw %1, %2 + punpcklqdq %1, %3 + %endif +%endmacro + +%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero + %if cpuflag(ssse3) + pmaddubsw %1, %2 + %else + %if %5 == 1 + pxor %3, %3 + %endif + punpckhbw %4, %1, %3 + punpcklbw %1, %1, %3 + pmaddwd %4, %2 + pmaddwd %1, %2 + packssdw %1, %4 + %endif +%endmacro + +%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2 + %else + punpckhwd %3, %1, %4 + punpcklwd %1, %4 + pmaddwd %3, %2 + pmaddwd %1, %2 + psrad %3, %5 + psrad %1, %5 + packssdw %1, %3 + %endif +%endmacro + +%macro PREP_BILIN 0 +%if ARCH_X86_32 + %define base r6-prep%+SUFFIX +%else + %define base 0 +%endif + +cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + LEA r6, 
prep%+SUFFIX + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: +%if notcpuflag(ssse3) + add r6, prep_ssse3 - prep_sse2 + jmp prep_ssse3 +%else + movzx wd, word [r6+wq*2+table_offset(prep,)] + pxor m4, m4 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd m0, [srcq+strideq*0] + movd m1, [srcq+strideq*1] + movd m2, [srcq+strideq*2] + movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklbw m0, m4 + punpcklbw m2, m4 + psllw m0, 4 + psllw m2, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq m0, [srcq+strideq*0] + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] + movq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m1, [srcq+strideq*0] + movu m3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w128: + mov r3, -128 + jmp .prep_w32_start +.prep_w64: + mov r3, -64 + jmp .prep_w32_start +.prep_w32: + mov r3, -32 +.prep_w32_start: + sub srcq, r3 +.prep_w32_vloop: + mov r6, r3 +.prep_w32_hloop: + movu m1, [srcq+r6+16*0] + movu m3, [srcq+r6+16*1] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + add r6, 32 + 
jl .prep_w32_hloop + add srcq, strideq + dec hd + jg .prep_w32_vloop + RET +%endif +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] +%if cpuflag(ssse3) + imul mxyd, 0x00ff00ff + mova m4, [base+bilin_h_shuf8] + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + add mxyd, 16 +%endif + movd m5, mxyd + mov mxyd, r6m ; my + pshufd m5, m5, q0000 + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] +%if notcpuflag(ssse3) + WIN64_SPILL_XMM 8 + pxor m6, m6 +%endif + add wq, r6 + jmp wq +.h_w4: +%if cpuflag(ssse3) + mova m4, [base+bilin_h_shuf4] +%endif + lea stride3q, [strideq*3] +.h_w4_loop: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + PSHUFB_BILIN_H4 m0, m4, m2 + PMADDUBSW m0, m5, m6, m2, 0 + PSHUFB_BILIN_H4 m1, m4, m2 + PMADDUBSW m1, m5, m6, m2, 0 + mova [tmpq+0 ], m0 + mova [tmpq+16], m1 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + lea stride3q, [strideq*3] +.h_w8_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + movu m2, [srcq+strideq*1+8*0] + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova 
[tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .h_w16 + RET +.h_w128: + mov r3, -128 + jmp .h_w32_start +.h_w64: + mov r3, -64 + jmp .h_w32_start +.h_w32: + mov r3, -32 +.h_w32_start: + sub srcq, r3 +.h_w32_vloop: + mov r6, r3 +.h_w32_hloop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + movu m2, [srcq+r6+8*2] + movu m3, [srcq+r6+8*3] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + add r6, 32 + jl .h_w32_hloop + add srcq, strideq + dec hd + jg .h_w32_vloop + RET +.v: +%if notcpuflag(ssse3) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 +%endif + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] +%if cpuflag(ssse3) + imul mxyd, 0x00ff00ff + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + pxor m6, m6 + add mxyd, 16 +%endif + add wq, r6 + lea stride3q, [strideq*3] + movd m5, mxyd + pshufd m5, m5, q0000 + jmp wq +.v_w4: + movd m0, [srcq+strideq*0] +.v_w4_loop: + movd m1, [srcq+strideq*1] + movd m2, [srcq+strideq*2] + movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpckldq m0, m1 + punpckldq m1, m2 + punpcklbw m0, m1 ; 01 12 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + movd m0, [srcq+strideq*0] + punpckldq m2, m3 + punpckldq m3, m0 + punpcklbw m2, m3 ; 23 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq m0, [srcq+strideq*0] +.v_w8_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] + movq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m0, m1 ; 01 + punpcklbw m1, m2 ; 12 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + movq m0, [srcq+strideq*0] + 
punpcklbw m2, m3 ; 23 + punpcklbw m3, m0 ; 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m1 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m4, m0, m1 + punpckhbw m0, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0] + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*3], m1 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*4], m4 + punpcklbw m4, m3, m0 + punpckhbw m3, m0 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*5], m2 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*6], m4 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + sub hd, 4 + jg .v_w16_loop + RET +.v_w128: + lea r3d, [hq+(3<<8)] + mov r6d, 256 + jmp .v_w32_start +.v_w64: + lea r3d, [hq+(1<<8)] + mov r6d, 128 + jmp .v_w32_start +.v_w32: + xor r3d, r3d + mov r6d, 64 +.v_w32_start: +%if ARCH_X86_64 + %if WIN64 + PUSH r7 + %endif + mov r7, tmpq +%endif + mov r5, srcq +.v_w32_hloop: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] +.v_w32_vloop: + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m0, m2 + punpckhbw m0, m2 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0+16*0] + punpcklbw m4, m1, m3 + punpckhbw m1, m3 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + mova [tmpq+16*3], m1 + movu m1, [srcq+strideq*0+16*1] + add tmpq, r6 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, 
m7, 0 + mova [tmpq+16*0], m4 + mova [tmpq+16*1], m2 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + mova [tmpq+16*3], m3 + add tmpq, r6 + sub hd, 2 + jg .v_w32_vloop + add r5, 32 + movzx hd, r3b + mov srcq, r5 +%if ARCH_X86_64 + add r7, 16*4 + mov tmpq, r7 +%else + mov tmpq, tmpmp + add tmpq, 16*4 + mov tmpmp, tmpq +%endif + sub r3d, 1<<8 + jg .v_w32_hloop +%if WIN64 + POP r7 +%endif + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] +%assign stack_offset stack_offset - stack_size_padded +%if cpuflag(ssse3) + imul mxyd, 0x08000800 + WIN64_SPILL_XMM 8 +%else + or mxyd, 1<<16 + WIN64_SPILL_XMM 9 + %if ARCH_X86_64 + mova m8, [base+pw_8] + %else + %define m8 [base+pw_8] + %endif + pxor m7, m7 +%endif + movd m6, mxyd + add wq, r6 + pshufd m6, m6, q0000 + jmp wq +.hv_w4: +%if cpuflag(ssse3) + mova m4, [base+bilin_h_shuf4] + movddup m0, [srcq+strideq*0] +%else + movhps m0, [srcq+strideq*0] +%endif + lea r3, [strideq*3] + PSHUFB_BILIN_H4 m0, m4, m3 + PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movhps m1, [srcq+strideq*2] + movq m2, [srcq+r3 ] + lea srcq, [srcq+strideq*4] + movhps m2, [srcq+strideq*0] + PSHUFB_BILIN_H4 m1, m4, m3 + PSHUFB_BILIN_H4 m2, m4, m3 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 + PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 + shufpd m0, m1, 0x01 ; 0 1 + shufpd m3, m1, m2, 0x01 ; 2 3 + psubw m1, m0 + PMULHRSW m1, m6, m4, m8, 4 + paddw m1, m0 + mova m0, m2 + psubw m2, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 + mova [tmpq+16*0], m1 + mova [tmpq+16*1], m2 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu m2, [srcq+strideq*0] + 
PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 + PMADDUBSW m2, m5, m7, m4, 0 ; 2 + psubw m3, m1, m0 + PMULHRSW m3, m6, m4, m8, 4 + paddw m3, m0 + mova m0, m2 + psubw m2, m1 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m1 + mova [tmpq+16*0], m3 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r5d, 256 + jmp .hv_w16_start +.hv_w64: + lea r3d, [hq+(3<<8)] + mov r5d, 128 + jmp .hv_w16_start +.hv_w32: + lea r3d, [hq+(1<<8)] + mov r5d, 64 + jmp .hv_w16_start +.hv_w16: + xor r3d, r3d + mov r5d, 32 +.hv_w16_start: +%if ARCH_X86_64 || cpuflag(ssse3) + mov r6, srcq +%endif +%if ARCH_X86_64 + %if WIN64 + PUSH r7 + %endif + mov r7, tmpq +%endif +.hv_w16_hloop: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0a + PMADDUBSW m1, m5, m7, m4, 0 ; 0b +.hv_w16_vloop: + movu m2, [srcq+strideq*1+8*0] + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m2, m5, m7, m4, 0 ; 1a + psubw m3, m2, m0 + PMULHRSW m3, m6, m4, m8, 4 + paddw m3, m0 + mova [tmpq+16*0], m3 + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m3, m5, m7, m4, 0 ; 1b + psubw m0, m3, m1 + PMULHRSW m0, m6, m4, m8, 4 + paddw m0, m1 + mova [tmpq+16*1], m0 + add tmpq, r5 + movu m0, [srcq+strideq*0+8*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 2a + psubw m1, m0, m2 + PMULHRSW m1, m6, m4, m8, 4 + paddw m1, m2 + mova [tmpq+16*0], m1 + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 2b + psubw m2, m1, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 + mova [tmpq+16*1], m2 + add tmpq, r5 + sub hd, 2 + jg .hv_w16_vloop + movzx hd, r3b +%if ARCH_X86_64 + add r6, 16 + add r7, 2*16 + mov srcq, r6 + mov tmpq, r7 +%elif cpuflag(ssse3) + mov tmpq, tmpm + add r6, 16 + add tmpq, 2*16 + mov srcq, r6 + mov tmpm, tmpq +%else + mov srcq, srcm + mov tmpq, tmpm + add 
srcq, 16 + add tmpq, 2*16 + mov srcm, srcq + mov tmpm, tmpq +%endif + sub r3d, 1<<8 + jg .hv_w16_hloop +%if WIN64 + POP r7 +%endif + RET +%endmacro + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2 +%elif WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +FN put_8tap, sharp, SHARP, SHARP +FN put_8tap, sharp_smooth, SHARP, SMOOTH +FN put_8tap, smooth_sharp, SMOOTH, SHARP +FN put_8tap, smooth, SMOOTH, SMOOTH +FN put_8tap, sharp_regular, SHARP, REGULAR +FN put_8tap, regular_sharp, REGULAR, SHARP +FN put_8tap, smooth_regular, SMOOTH, REGULAR +FN put_8tap, regular_smooth, REGULAR, SMOOTH +FN put_8tap, regular, REGULAR, REGULAR + +%if ARCH_X86_32 + %define base_reg r1 + %define base base_reg-put_ssse3 +%else + %define base_reg r8 + %define base 0 +%endif + +cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +%assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h +%if ARCH_X86_64 + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v +%else + imul ssd, mym, 0x010101 + add ssd, t1d ; 8tap_v, my, 4tap_v + mov srcq, srcm +%endif + mov wd, wm + movifnidn hd, hm + LEA base_reg, put_ssse3 + test mxd, 0xf00 + jnz .h +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .v + tzcnt wd, wd + movzx wd, word [base_reg+wq*2+table_offset(put,)] + add wq, base_reg +; put_bilin mangling jump +%assign stack_offset org_stack_offset + movifnidn dsq, dsmp + movifnidn ssq, ssmp +%if WIN64 + pop r8 +%endif + lea r6, [ssq*3] + jmp wq +.h: +%if ARCH_X86_32 + test ssd, 0xf00 
+%else + test myd, 0xf00 +%endif + jnz .hv + movifnidn ssq, ssmp + WIN64_SPILL_XMM 12 + cmp wd, 4 + jl .h_w2 + je .h_w4 + tzcnt wd, wd +%if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%endif + shr mxd, 16 + sub srcq, 3 + movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] + movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] + mova m7, [base+pw_34] ; 2 + (8 << 2) + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 + add wq, base_reg + jmp wq +.h_w2: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + mova m4, [base+subpel_h_shuf4] + movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp +.h_w2_loop: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pmaddubsw m0, m3 + phaddw m0, m0 + paddw m0, m5 ; pw34 + psraw m0, 6 + packuswb m0, m0 + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] + mova m6, [base+subpel_h_shufA] + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp +.h_w4_loop: + movq m0, [srcq+ssq*0] ; 1 + movq m1, [srcq+ssq*1] ; 2 + lea srcq, [srcq+ssq*2] + pshufb m0, m6 ; subpel_h_shufA + pshufb m1, m6 ; subpel_h_shufA + pmaddubsw m0, m3 ; subpel_filters + pmaddubsw m1, m3 ; subpel_filters + phaddw m0, m1 + paddw m0, m5 ; pw34 + psraw m0, 6 + packuswb m0, m0 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + %if ARCH_X86_32 + pshufb %2, %1, [base+subpel_h_shufB] + pshufb %3, %1, [base+subpel_h_shufC] + pshufb %1, [base+subpel_h_shufA] + %else + pshufb %2, %1, m11; subpel_h_shufB + 
pshufb %3, %1, m9 ; subpel_h_shufC + pshufb %1, m10 ; subpel_h_shufA + %endif + pmaddubsw %4, %2, m5 ; subpel +0 B0 + pmaddubsw %2, m6 ; subpel +4 B4 + pmaddubsw %3, m6 ; C4 + pmaddubsw %1, m5 ; A0 + paddw %3, %4 ; C4+B0 + paddw %1, %2 ; A0+B4 + phaddw %1, %3 + paddw %1, m7 ; pw34 + psraw %1, 6 +%endmacro +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + PUT_8TAP_H m0, m2, m3, m4 + PUT_8TAP_H m1, m2, m3, m4 + packuswb m0, m1 +%if ARCH_X86_32 + movq [dstq], m0 + add dstq, dsm + movhps [dstq], m0 + add dstq, dsm +%else + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] +%endif + sub hd, 2 + jg .h_w8 + RET +.h_w128: + mov r4, -16*7 + jmp .h_w16_start +.h_w64: + mov r4, -16*3 + jmp .h_w16_start +.h_w32: + mov r4, -16*1 + jmp .h_w16_start +.h_w16: + xor r4d, r4d +.h_w16_start: + sub srcq, r4 + sub dstq, r4 +.h_w16_loop_v: + mov r6, r4 +.h_w16_loop_h: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H m0, m2, m3, m4 + PUT_8TAP_H m1, m2, m3, m4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 16 + jle .h_w16_loop_h + add srcq, ssq + add dstq, dsmp + dec hd + jg .h_w16_loop_v + RET +.v: +%if ARCH_X86_32 + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] +%else + %assign stack_offset org_stack_offset + WIN64_SPILL_XMM 16 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] +%endif + tzcnt r6d, wd + movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] + punpcklwd m0, m0 + mova m7, [base+pw_512] + add r6, base_reg +%if ARCH_X86_32 + %define subpel0 [rsp+mmsize*0] + %define subpel1 [rsp+mmsize*1] + %define subpel2 [rsp+mmsize*2] + %define subpel3 [rsp+mmsize*3] +%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed + ALLOC_STACK -16*4 +%assign regs_used 7 + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + pshufd m1, m0, q2222 + mova 
subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 + mov ssq, [rstk+stack_offset+gprsize*4] + lea ssq, [ssq*3] + sub srcq, ssq + mov ssq, [rstk+stack_offset+gprsize*4] + mov dsq, [rstk+stack_offset+gprsize*2] +%else + %define subpel0 m8 + %define subpel1 m9 + %define subpel2 m10 + %define subpel3 m11 + lea ss3q, [ssq*3] + pshufd m8, m0, q0000 + sub srcq, ss3q + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 +%endif + jmp r6 +.v_w2: + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] + add srcq, ss3q +%endif + punpcklwd m1, m0 ; 0 1 + punpcklwd m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpcklwd m2, m5 ; 2 3 + punpcklwd m5, m3 ; 3 4 + punpcklwd m3, m4 ; 4 5 + punpcklwd m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, subpel0 ; a0 b0 + mova m1, m2 + pmaddubsw m2, subpel1 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, subpel2 ; a2 b2 + paddw m5, m3 + punpcklwd m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpcklwd m4, m0 ; 7 8 + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, subpel3 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + packuswb m5, m5 + movd r6d, m5 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 +.v_w8: +.v_w16: +.v_w32: +.v_w64: +.v_w128: + shl wd, 14 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*4+gprsize] + mov dstm, dstq +%endif + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq +.v_w4_loop0: +%endif + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + 
movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] + add srcq, ss3q +%endif + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m4 ; 4 5 + punpckldq m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 +.v_w4_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, subpel0 ; a0 b0 + mova m1, m2 + pmaddubsw m2, subpel1 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, subpel2 ; a2 b2 + paddw m5, m3 + punpckldq m3, m0, m4 ; 6 7 _ _ + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 _ _ + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, subpel3 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + packuswb m5, m5 + movd [dstq+dsq*0], m5 + psrlq m5, 32 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if ARCH_X86_32 + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq + sub r6d, 1<<16 + jg .v_w4_loop0 +%endif + RET +%if ARCH_X86_64 +.v_w8: +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] +.v_w8_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, ss3q + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, ss3q + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 +.v_w8_loop: + movq m13, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m14, m1, subpel0 ; a0 + mova m1, m3 + pmaddubsw m15, m2, subpel0 ; b0 + mova m2, m4 + pmaddubsw m3, subpel1 ; a1 + mova m12, m0 + pmaddubsw m4, subpel1 ; b1 + movq m0, [srcq+ssq*0] + paddw m14, m3 + paddw m15, m4 + 
mova m3, m5 + pmaddubsw m5, subpel2 ; a2 + mova m4, m6 + pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m13 ; 67 + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 + paddw m15, m6 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + packuswb m14, m15 + movq [dstq+dsq*0], xm14 + movhps [dstq+dsq*1], xm14 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + add r4, 8 + add r7, 8 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +%undef subpel0 +%undef subpel1 +%undef subpel2 +%undef subpel3 +.hv: + %assign stack_offset org_stack_offset + cmp wd, 4 + jg .hv_w8 +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] +%if ARCH_X86_32 + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] + mov ssq, ssmp + lea r6, [ssq*3] + sub srcq, r6 + %define base_reg r6 + mov r6, r1; use as new base + %assign regs_used 2 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + mov dsq, [rstk+stack_offset+gprsize*2] + %define subpelv0 [rsp+mmsize*0] + %define subpelv1 [rsp+mmsize*1] + %define subpelv2 [rsp+mmsize*2] + %define subpelv3 [rsp+mmsize*3] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m6, m0, q0000 + mova subpelv0, m6 + pshufd m6, m0, q1111 + mova subpelv1, m6 + pshufd m6, m0, q2222 + mova subpelv2, m6 + pshufd m6, m0, q3333 + mova subpelv3, m6 +%else + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] + ALLOC_STACK mmsize*14, 14 + lea ss3q, [ssq*3] + sub srcq, ss3q + %define subpelv0 m10 + %define subpelv1 m11 + %define subpelv2 m12 + %define subpelv3 m13 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + mova m8, [base+pw_8192] + mova m9, [base+pd_512] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + 
pshufd m13, m0, q3333 +%endif + pshufd m7, m1, q0000 + cmp wd, 4 + je .hv_w4 +.hv_w2: + mova m6, [base+subpel_h_shuf4] + movq m2, [srcq+ssq*0] ; 0 + movhps m2, [srcq+ssq*1] ; 0 _ 1 +%if ARCH_X86_32 + %define w8192reg [base+pw_8192] + %define d512reg [base+pd_512] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] ; 2 + movhps m0, [srcq+ssq*1] ; 2 _ 3 + lea srcq, [srcq+ssq*2] +%else + %define w8192reg m8 + %define d512reg m9 + movq m0, [srcq+ssq*2] ; 2 + add srcq, ss3q + movhps m0, [srcq+ssq*0] ; 2 _ 3 +%endif + pshufb m2, m6 ; 0 ~ 1 ~ + pshufb m0, m6 ; 2 ~ 3 ~ + pmaddubsw m2, m7 ; subpel_filters + pmaddubsw m0, m7 ; subpel_filters + phaddw m2, m0 ; 0 1 2 3 + pmulhrsw m2, w8192reg +%if ARCH_X86_32 + movq m3, [srcq+ssq*0] ; 4 + movhps m3, [srcq+ssq*1] ; 4 _ 5 + lea srcq, [srcq+ssq*2] +%else + movq m3, [srcq+ssq*1] ; 4 + movhps m3, [srcq+ssq*2] ; 4 _ 5 + add srcq, ss3q +%endif + movq m0, [srcq+ssq*0] ; 6 + pshufb m3, m6 ; 4 ~ 5 ~ + pshufb m0, m6 ; 6 ~ + pmaddubsw m3, m7 ; subpel_filters + pmaddubsw m0, m7 ; subpel_filters + phaddw m3, m0 ; 4 5 6 _ + pmulhrsw m3, w8192reg + palignr m4, m3, m2, 4; V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 + punpckhwd m2, m4 ; V 23 34 2 3 3 4 + pshufd m0, m3, q2121; V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 4 5 5 6 +.hv_w2_loop: + movq m4, [srcq+ssq*1] ; V 7 + lea srcq, [srcq+ssq*2] ; V + movhps m4, [srcq+ssq*0] ; V 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 ; V + pmaddwd m2, subpelv1 ; V a1 b1 + paddd m5, m2 ; V + mova m2, m3 ; V + pmaddwd m3, subpelv2 ; a2 b2 + phaddw m4, m4 + pmulhrsw m4, w8192reg + paddd m5, m3 ; V + palignr m3, m4, m0, 12 + mova m0, m4 + punpcklwd m3, m0 ; V 67 78 + pmaddwd m4, m3, subpelv3 ; V a3 b3 + paddd m5, d512reg + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + packuswb m5, m5 + movd r4d, m5 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +%undef w8192reg +%undef d512reg +.hv_w4: 
+%define hv4_line_0_0 4 +%define hv4_line_0_1 5 +%define hv4_line_0_2 6 +%define hv4_line_0_3 7 +%define hv4_line_0_4 8 +%define hv4_line_0_5 9 +%define hv4_line_1_0 10 +%define hv4_line_1_1 11 +%define hv4_line_1_2 12 +%define hv4_line_1_3 13 +%macro SAVELINE_W4 3 + mova [rsp+mmsize*hv4_line_%3_%2], %1 +%endmacro +%macro RESTORELINE_W4 3 + mova %1, [rsp+mmsize*hv4_line_%3_%2] +%endmacro +%if ARCH_X86_32 + %define w8192reg [base+pw_8192] + %define d512reg [base+pd_512] +%else + %define w8192reg m8 + %define d512reg m9 +%endif + ; lower shuffle 0 1 2 3 4 + mova m6, [base+subpel_h_shuf4] + movq m5, [srcq+ssq*0] ; 0 _ _ _ + movhps m5, [srcq+ssq*1] ; 0 _ 1 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 2 _ _ _ + movhps m4, [srcq+ssq*1] ; 2 _ 3 _ + lea srcq, [srcq+ssq*2] +%else + movq m4, [srcq+ssq*2] ; 2 _ _ _ + movhps m4, [srcq+ss3q ] ; 2 _ 3 _ + lea srcq, [srcq+ssq*4] +%endif + pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 ;H 0 1 2 3 + pmulhrsw m2, w8192reg ;H pw_8192 + SAVELINE_W4 m2, 2, 0 + ; upper shuffle 2 3 4 5 6 + mova m6, [base+subpel_h_shuf4+16] + pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 ;H 0 1 2 3 + pmulhrsw m2, w8192reg ;H pw_8192 + ; + ; lower shuffle + mova m6, [base+subpel_h_shuf4] + movq m5, [srcq+ssq*0] ; 4 _ _ _ + movhps m5, [srcq+ssq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 6 _ _ _ + add srcq, ssq +%else + movq m4, [srcq+ssq*2] ; 6 _ _ _ + add srcq, ss3q +%endif + pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg ;H pw_8192 + SAVELINE_W4 m3, 3, 0 + ; upper 
shuffle + mova m6, [base+subpel_h_shuf4+16] + pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg ;H pw_8192 + ;process high + palignr m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + ;process low + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + palignr m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 +.hv_w4_loop: + ;process low + pmaddwd m5, m1, subpelv0 ; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 + mova m6, [base+subpel_h_shuf4] + movq m4, [srcq+ssq*0] ; 7 + movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ + pmaddubsw m4, m7 ;H subpel_filters + phaddw m4, m4 ;H 7 8 7 8 + pmulhrsw m4, w8192reg ;H pw_8192 + palignr m3, m4, m0, 12 ; 6 7 8 7 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 + paddd m5, d512reg ; pd_512 + paddd m5, m4 + psrad m5, 10 + SAVELINE_W4 m0, 0, 0 + SAVELINE_W4 m1, 1, 0 + SAVELINE_W4 m2, 2, 0 + SAVELINE_W4 m3, 3, 0 + SAVELINE_W4 m5, 5, 0 + ;process high + RESTORELINE_W4 m0, 0, 1 + RESTORELINE_W4 m1, 1, 1 + RESTORELINE_W4 m2, 2, 1 + RESTORELINE_W4 m3, 3, 1 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 + mova m6, [base+subpel_h_shuf4+16] + movq m4, [srcq+ssq*0] ; 7 + movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + lea srcq, [srcq+ssq*2] + pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ + pmaddubsw m4, m7 ;H subpel_filters + phaddw m4, m4 ;H 7 8 7 8 + pmulhrsw m4, 
w8192reg ;H pw_8192 + palignr m3, m4, m0, 12 ; 6 7 8 7 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 + paddd m5, d512reg ; pd_512 + paddd m5, m4 + psrad m4, m5, 10 + RESTORELINE_W4 m5, 5, 0 + packssdw m5, m4 ; d -> w + packuswb m5, m5 ; w -> b + pshuflw m5, m5, q3120 + movd [dstq+dsq*0], m5 + psrlq m5, 32 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + RESTORELINE_W4 m0, 0, 0 + RESTORELINE_W4 m1, 1, 0 + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + jg .hv_w4_loop + RET +%undef subpelv0 +%undef subpelv1 +%undef subpelv2 +%undef subpelv3 +.hv_w8: + %assign stack_offset org_stack_offset +%define hv8_line_1 0 +%define hv8_line_2 1 +%define hv8_line_3 2 +%define hv8_line_4 3 +%define hv8_line_6 4 +%macro SAVELINE_W8 2 + mova [rsp+hv8_line_%1*mmsize], %2 +%endmacro +%macro RESTORELINE_W8 2 + mova %2, [rsp+hv8_line_%1*mmsize] +%endmacro + shr mxd, 16 + sub srcq, 3 +%if ARCH_X86_32 + %define base_reg r1 + %define subpelh0 [rsp+mmsize*5] + %define subpelh1 [rsp+mmsize*6] + %define subpelv0 [rsp+mmsize*7] + %define subpelv1 [rsp+mmsize*8] + %define subpelv2 [rsp+mmsize*9] + %define subpelv3 [rsp+mmsize*10] + %define accuv0 [rsp+mmsize*11] + %define accuv1 [rsp+mmsize*12] + movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] + mov ssq, ssmp + ALLOC_STACK -mmsize*13 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*13+gprsize*1] + %define dsm [rsp+mmsize*13+gprsize*2] + mov r6, [rstk+stack_offset+gprsize*2] + mov dsm, r6 +%endif + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + punpcklbw m5, m5 + psraw m5, 8 ; sign-extend + pshufd m2, m5, q0000 + pshufd m3, m5, q1111 + pshufd m4, m5, q2222 + pshufd m5, m5, q3333 + mova subpelh0, m0 + mova subpelh1, m1 + mova subpelv0, m2 + mova subpelv1, m3 + mova subpelv2, m4 + mova subpelv3, 
m5 + lea r6, [ssq*3] + mov dstm, dstq + sub srcq, r6 +%else + ALLOC_STACK 16*5, 16 + %define subpelh0 m10 + %define subpelh1 m11 + %define subpelv0 m12 + %define subpelv1 m13 + %define subpelv2 m14 + %define subpelv3 m15 + %define accuv0 m8 + %define accuv1 m9 + movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] + pshufd subpelh0, m0, q0000 + pshufd subpelh1, m0, q1111 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + pshufd subpelv0, m1, q0000 + pshufd subpelv1, m1, q1111 + pshufd subpelv2, m1, q2222 + pshufd subpelv3, m1, q3333 + lea ss3q, [ssq*3] + mov r7, dstq + sub srcq, ss3q +%endif + shl wd, 14 + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq +.hv_w8_loop0: + movu m4, [srcq+ssq*0] ; 0 = _ _ + movu m5, [srcq+ssq*1] ; 1 = _ _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] +%endif +%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + %if ARCH_X86_32 + pshufb %3, %1, [base+subpel_h_shufB] + pshufb %4, %1, [base+subpel_h_shufC] + pshufb %1, [base+subpel_h_shufA] + %else + pshufb %3, %1, %6 ; subpel_h_shufB + pshufb %4, %1, %7 ; subpel_h_shufC + pshufb %1, %5 ; subpel_h_shufA + %endif + pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 + pmaddubsw %4, subpelh1; subpel +4 B4 + pmaddubsw %3, subpelh1; C4 + pmaddubsw %1, subpelh0; A0 + paddw %2, %4 ; C0+B4 + paddw %1, %3 ; A0+C4 + phaddw %1, %2 +%endmacro +%if ARCH_X86_64 + mova m7, [base+subpel_h_shufA] + mova m8, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%endif + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ +%if ARCH_X86_32 + movu m6, [srcq+ssq*0] ; 2 = _ _ + movu m0, [srcq+ssq*1] ; 3 = _ _ + lea srcq, [srcq+ssq*2] +%else + movu m6, [srcq+ssq*2] ; 2 = _ _ + add srcq, ss3q + movu m0, [srcq+ssq*0] ; 3 = _ _ +%endif + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ + mova m7, [base+pw_8192] + pmulhrsw m4, m7 ; H pw_8192 + 
pmulhrsw m5, m7 ; H pw_8192 + pmulhrsw m6, m7 ; H pw_8192 + pmulhrsw m0, m7 ; H pw_8192 + punpcklwd m1, m4, m5 ; 0 1 ~ + punpcklwd m2, m5, m6 ; 1 2 ~ + punpcklwd m3, m6, m0 ; 2 3 ~ + SAVELINE_W8 1, m1 + SAVELINE_W8 2, m2 + SAVELINE_W8 3, m3 + mova m7, [base+subpel_h_shufA] +%if ARCH_X86_32 + movu m4, [srcq+ssq*0] ; 4 = _ _ + movu m5, [srcq+ssq*1] ; 5 = _ _ + lea srcq, [srcq+ssq*2] +%else + movu m4, [srcq+ssq*1] ; 4 = _ _ + movu m5, [srcq+ssq*2] ; 5 = _ _ + add srcq, ss3q +%endif + movu m6, [srcq+ssq*0] ; 6 = _ _ + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ + mova m7, [base+pw_8192] + pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ + pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ + pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ + punpcklwd m4, m0, m1 ; 3 4 ~ + punpcklwd m5, m1, m2 ; 4 5 ~ + punpcklwd m6, m2, m3 ; 5 6 ~ + SAVELINE_W8 6, m3 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 +.hv_w8_loop: + ; m8 accu for V a + ; m9 accu for V b + SAVELINE_W8 1, m3 + SAVELINE_W8 2, m4 + SAVELINE_W8 3, m5 + SAVELINE_W8 4, m6 +%if ARCH_X86_32 + pmaddwd m0, m1, subpelv0 ; a0 + pmaddwd m7, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m0, m3 + paddd m7, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m0, m5 + paddd m7, m6 + mova m5, [base+pd_512] + paddd m0, m5 ; pd_512 + paddd m7, m5 ; pd_512 + mova accuv0, m0 + mova accuv1, m7 +%else + pmaddwd m8, m1, subpelv0 ; a0 + pmaddwd m9, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m8, m3 + paddd m9, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m8, m5 + paddd m9, m6 + mova m7, [base+pd_512] + paddd m8, m7 ; pd_512 + paddd m9, m7 ; pd_512 + mova m7, [base+subpel_h_shufB] + mova m6, [base+subpel_h_shufC] + mova m5, [base+subpel_h_shufA] +%endif + movu m0, [srcq+ssq*1] ; 7 + movu m4, [srcq+ssq*2] ; 8 + lea srcq, [srcq+ssq*2] + HV_H_W8 m0, m1, 
m2, m3, m5, m7, m6
+    HV_H_W8         m4, m1, m2, m3, m5, m7, m6
+    mova            m5, [base+pw_8192]
+    pmulhrsw        m0, m5 ; H pw_8192
+    pmulhrsw        m4, m5 ; H pw_8192
+    RESTORELINE_W8  6, m6
+    punpcklwd       m5, m6, m0 ; 6 7 ~
+    punpcklwd       m6, m0, m4 ; 7 8 ~
+    pmaddwd         m1, m5, subpelv3 ; a3
+    paddd           m2, m1, accuv0
+    pmaddwd         m1, m6, subpelv3 ; b3
+    paddd           m1, m1, accuv1 ; H + V
+    psrad           m2, 10
+    psrad           m1, 10
+    packssdw        m2, m1 ; d -> w
+    packuswb        m2, m1 ; w -> b
+    movd            [dstq+dsq*0], m2
+    psrlq           m2, 32
+%if ARCH_X86_32
+    add             dstq, dsm
+    movd            [dstq+dsq*0], m2
+    add             dstq, dsm
+%else
+    movd            [dstq+dsq*1], m2
+    lea             dstq, [dstq+dsq*2]
+%endif
+    sub             hd, 2
+    jle .hv_w8_outer
+    SAVELINE_W8     6, m4
+    RESTORELINE_W8  1, m1
+    RESTORELINE_W8  2, m2
+    RESTORELINE_W8  3, m3
+    RESTORELINE_W8  4, m4
+    jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+    mov             dstq, dstm
+    add             r4, 4
+    movzx           hd, r6w
+    add             dstq, 4
+    mov             srcq, r4
+    mov             dstm, dstq
+%else
+    add             r4, 4
+    add             r7, 4
+    movzx           hd, r6b
+    mov             srcq, r4
+    mov             dstq, r7
+%endif
+    sub             r6d, 1<<16
+    jg .hv_w8_loop0
+    RET
+; PSHUFB_SUBPEL_H_4: pshufb %1, %2 on SSSE3; otherwise a mask/shift/or sequence that keeps dwords 0 and 2 of %1 and fills dwords 1 and 3 from %1 shifted right by one byte.
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+    pshufb          %1, %2
+ %else
+  %if %5 == 1
+    pcmpeqd         %2, %2 ; reset_mask == 1: rebuild the mask, starting from all ones
+    psrlq           %2, 32 ; mask = low dword of every qword
+  %endif
+    psrldq          %3, %1, 1 ; tmp1 = src shifted right by one byte
+    pshufd          %3, %3, q2301 ; swap the two dwords inside each qword
+    pand            %1, %2 ; keep dwords 0 and 2 of src
+    pandn           %4, %2, %3 ; tmp2 = dwords 1 and 3 of tmp1
+    por             %1, %4 ; merge the two halves
+ %endif
+%endmacro
+; PSHUFB_SUBPEL_H_4a: copy src1 (%2) into dst (%1) when they are different registers, then apply PSHUFB_SUBPEL_H_4 to dst.
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+    mova            %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+; PSHUFB_SUBPEL_H_4b: same wrapper shape as PSHUFB_SUBPEL_H_4a, except that without SSSE3 dst is always loaded as src1 shifted right by 16 bits per qword instead of a plain copy.
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+    psrlq           %1, %2, 16 ; dst = src1 >> 16 within each qword
+ %elifnidn %1, %2
+    mova            %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+; PALIGNR: palignr on SSSE3; otherwise emulated with psrldq/pslldq/por, which clobbers tmp -- the register numbered one above dst when no fifth argument is given.
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+    palignr         %1, %2, %3, %4
+ %else
+  %if %0 == 4
+   %assign %%i regnumof%+%1 + 1
+   %define %%tmp m %+ %%i
+  %else
+   %define %%tmp %5
+  %endif
+    psrldq          %1, %3, %4 ; low part: src2 shifted right by shift bytes
+    pslldq          %%tmp, %2, 16-%4 ; high part: src1 shifted left by (16 - shift) bytes
+    por             %1, %%tmp
+ %endif
+%endmacro
+; PHADDW: phaddw on SSSE3; otherwise pmaddwd against pw_1 (presumably words of 1, making it a pairwise add -- the constant is defined outside this view) followed by packssdw.
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+    phaddw          %1, %2
+ %elifnidn %1, %2
+  %if %4 == 1
+    mova            %3, [base+pw_1] ; load_pw_1 == 1: fetch the constant into %3 first
+  %endif
+    pmaddwd         %1, %3
+    pmaddwd         %2, %3 ; note: src (%2) is overwritten on this path
+    packssdw        %1, %2
+ %else
+  %if %4 == 1
+    pmaddwd         %1, [base+pw_1] ; dst == src: %3 is left untouched
+  %else
+    pmaddwd         %1, %3
+  %endif
+    packssdw        %1, %1 ; both halves of dst receive the same packed sums
+ %endif
+%endmacro
+; PMULHRSW_POW2: pmulhrsw dst, src1, src2 on SSSE3; otherwise (src1 + src2) >> shift (arithmetic), so on that path src2 acts as an additive rounding term, not a multiplier.
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+    pmulhrsw        %1, %2, %3
+ %else
+    paddw           %1, %2, %3
+    psraw           %1, %4
+ %endif
+%endmacro
+; PMULHRSW_8192: PMULHRSW_POW2 with shift = 2; call sites in this file pass pw_8192 as src2 for SSSE3 and pw_2 otherwise.
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+    PMULHRSW_POW2 %1, %2, %3, 2
+%endmacro
+; PREP_8TAP_H_LOAD4: gather the four overlapping dwords at byte offsets 0..3 of src_memloc into dst (dword n = bytes n..n+3); %3-%5 are scratch.
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-3]
+    movd            %1, [%2+0]
+    movd            %3, [%2+1]
+    movd            %4, [%2+2]
+    movd            %5, [%2+3]
+    punpckldq       %1, %3 ; dst  = offsets 0, 1
+    punpckldq       %4, %5 ; tmp2 = offsets 2, 3
+    punpcklqdq      %1, %4 ; dst  = offsets 0, 1, 2, 3
+%endmacro
+; PREP_8TAP_H_LOAD: load one source row for the horizontal 8-tap filter into m%1, m2 and m3 -- via pshufb with m10/m11/m9 on SSSE3, via overlapping movd loads otherwise.
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+    movu            m%1, [%2]
+    pshufb          m2, m%1, m11 ; subpel_h_shufB
+    pshufb          m3, m%1, m9 ; subpel_h_shufC
+    pshufb          m%1, m10 ; subpel_h_shufA
+ %else
+  %if ARCH_X86_64
+    SWAP            m12, m5 ; NOTE(review): appears to park m5-m7 under m12-m14 while m0-m11 serve as load targets; reversed by the SWAPs at the end -- confirm against x86inc SWAP semantics
+    SWAP            m13, m6
+    SWAP            m14, m7
+   %define %%mx0 m%+%%i
+   %define %%mx1 m%+%%j
+   %assign %%i 0
+   %rep 12
+    movd            %%mx0, [%2+%%i] ; m0..m11 = dwords at byte offsets 0..11
+    %assign %%i %%i+1
+   %endrep
+   %assign %%i 0
+   %rep 6
+    %assign %%j %%i+1
+    punpckldq       %%mx0, %%mx1 ; pair up (m0,m1), (m2,m3), ..., (m10,m11)
+    %assign %%i %%i+2
+   %endrep
+   %assign %%i 0
+   %rep 3
+    %assign %%j %%i+2
+    punpcklqdq      %%mx0, %%mx1 ; merge (m0,m2), (m4,m6), (m8,m10)
+    %assign %%i %%i+4
+   %endrep
+    SWAP            m%1, m0 ; results sit in m0, m4, m8; expose them as m%1, m2, m3
+    SWAP            m2, m4
+    SWAP            m3, m8
+    SWAP            m5, m12
+    SWAP            m6, m13
+    SWAP            m7, m14
+  %else
+    PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
+    PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
+    PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
+    SWAP            m%1, m0
+  %endif
+ %endif
+%endmacro
+; PREP_8TAP_H: horizontally filter one row from src_memloc into m%1 using the filter halves in m5/m6; m2, m3 and m4 are overwritten along the way (PMADDUBSW is defined outside this view).
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+    PREP_8TAP_H_LOAD %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP            m8, m1
+    SWAP            m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+    mova            m4, m2
+    PMADDUBSW       m4, m5, m1, m7, 1 ; subpel +0 B0
+    PMADDUBSW       m2, m6, m1, m7, 0 ; subpel +4 B4
+    PMADDUBSW       m3, m6, m1, m7, 0 ; subpel +4 C4
+    PMADDUBSW       mX, m5, m1, m7, 0 ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP            m1, m8
+    SWAP            m7, m9
+ %endif
+    paddw           m3, m4 ; C4+B0
+    paddw           m%1, m2 ; A0+B4
+    PHADDW          m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+    PMULHRSW_8192   m%1, m%1, m7
+ %else
+    PMULHRSW_8192   m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+; PREP_8TAP_HV: horizontal pass for the hv path -- filter one row from src_memloc into dst (%1) with subpelh0/subpelh1; m1, m2, m3 and both tmp registers are overwritten, and no rounding shift is applied here.
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+    movu            %1, [%2]
+    pshufb          m2, %1, shufB
+    pshufb          m3, %1, shufC
+    pshufb          %1, shufA
+ %else
+    PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
+    PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
+    PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
+ %endif
+    mova            m1, m2
+    PMADDUBSW       m1, subpelh0, %3, %4, 1 ; subpel +0 B0
+    PMADDUBSW       m3, subpelh1, %3, %4, 0 ; subpel +4 C4
+    PMADDUBSW       m2, subpelh1, %3, %4, 0 ; B4
+    PMADDUBSW       %1, subpelh0, %3, %4, 0 ; A0
+    paddw           m1, m3 ; B0+C4
+    paddw           %1, m2 ; A0+B4
+    PHADDW          %1, m1, %3, 1
+%endmacro
+; PREP_8TAP: declares the nine prep_8tap filter-combination entry points through FN (defined outside this view) followed by the shared prep_8tap body.
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+
+FN prep_8tap, sharp, SHARP, SHARP
+FN prep_8tap, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap, smooth, SMOOTH, SMOOTH
+FN prep_8tap, sharp_regular, SHARP, REGULAR
+FN prep_8tap, regular_sharp, REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+%else
+ %define base_reg r7
+ %define base 0
+%endif
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+    imul            mxd, mxm, 0x010101
+    add             mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul            myd, mym, 0x010101
+    add             myd, t1d ; 8tap_v, my, 4tap_v
+    mov             wd, wm
+    movifnidn       srcd, srcm
+    movifnidn       hd, hm
+    test            mxd, 0xf00
+    jnz .h
+    test            myd, 0xf00
+    jnz .v
+    LEA             base_reg, prep_ssse3
+    tzcnt           wd, wd
+    movzx           wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+    pxor            m4, m4
+    add             wq, 
base_reg + movifnidn strided, stridem + lea r6, [strideq*3] + %assign stack_offset org_stack_offset +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + LEA base_reg, prep%+SUFFIX + test myd, 0xf00 + jnz .hv +%if cpuflag(ssse3) + WIN64_SPILL_XMM 12 +%else + WIN64_SPILL_XMM 16 +%endif +%if ARCH_X86_32 + %define strideq r6 + mov strideq, stridem +%endif + cmp wd, 4 + je .h_w4 + tzcnt wd, wd +%if cpuflag(ssse3) + %if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] + %else + %define m10 [base+subpel_h_shufA] + %define m11 [base+subpel_h_shufB] + %define m9 [base+subpel_h_shufC] + %endif +%endif + shr mxd, 16 + sub srcq, 3 + movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] + movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) + mova m7, [base+pw_8192] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 +%else + punpcklbw m6, m6 + psraw m6, 8 + %if ARCH_X86_64 + mova m7, [pw_2] + mova m15, [pw_1] + %else + %define m15 m4 + %endif + pshufd m5, m6, q1010 + punpckhqdq m6, m6 +%endif + add wq, base_reg + jmp wq +.h_w4: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] +%if cpuflag(ssse3) + mova m6, [base+pw_8192] + mova m5, [base+subpel_h_shufA] + pshufd m4, m4, q0000 +%else + mova m6, [base+pw_2] + %if ARCH_X86_64 + mova m14, [pw_1] + %else + %define m14 m7 + %endif + punpcklbw m4, m4 + psraw m4, 8 + punpcklqdq m4, m4 +%endif +%if ARCH_X86_64 + lea stride3q, [strideq*3] +%endif +.h_w4_loop: +%if cpuflag(ssse3) + movq m0, [srcq+strideq*0] ; 0 + movq m1, [srcq+strideq*1] ; 1 + %if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m2, [srcq+strideq*0] ; 2 + movq m3, [srcq+strideq*1] ; 3 + lea srcq, [srcq+strideq*2] + %else + movq m2, [srcq+strideq*2] ; 2 + movq m3, [srcq+stride3q ] ; 3 + lea srcq, [srcq+strideq*4] + %endif + pshufb m0, m5 + pshufb m1, m5 + pshufb m2, m5 + pshufb m3, m5 +%elif ARCH_X86_64 + 
movd m0, [srcq+strideq*0+0] + movd m12, [srcq+strideq*0+1] + movd m1, [srcq+strideq*1+0] + movd m5, [srcq+strideq*1+1] + movd m2, [srcq+strideq*2+0] + movd m13, [srcq+strideq*2+1] + movd m3, [srcq+stride3q +0] + movd m7, [srcq+stride3q +1] + punpckldq m0, m12 + punpckldq m1, m5 + punpckldq m2, m13 + punpckldq m3, m7 + movd m12, [srcq+strideq*0+2] + movd m8, [srcq+strideq*0+3] + movd m5, [srcq+strideq*1+2] + movd m9, [srcq+strideq*1+3] + movd m13, [srcq+strideq*2+2] + movd m10, [srcq+strideq*2+3] + movd m7, [srcq+stride3q +2] + movd m11, [srcq+stride3q +3] + lea srcq, [srcq+strideq*4] + punpckldq m12, m8 + punpckldq m5, m9 + punpckldq m13, m10 + punpckldq m7, m11 + punpcklqdq m0, m12 ; 0 + punpcklqdq m1, m5 ; 1 + punpcklqdq m2, m13 ; 2 + punpcklqdq m3, m7 ; 3 +%else + movd m0, [srcq+strideq*0+0] + movd m1, [srcq+strideq*0+1] + movd m2, [srcq+strideq*0+2] + movd m3, [srcq+strideq*0+3] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklqdq m0, m2 ; 0 + movd m1, [srcq+strideq*1+0] + movd m2, [srcq+strideq*1+1] + movd m3, [srcq+strideq*1+2] + movd m7, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m1, m2 + punpckldq m3, m7 + punpcklqdq m1, m3 ; 1 + movd m2, [srcq+strideq*0+0] + movd m3, [srcq+strideq*0+1] + movd m7, [srcq+strideq*0+2] + movd m5, [srcq+strideq*0+3] + punpckldq m2, m3 + punpckldq m7, m5 + punpcklqdq m2, m7 ; 2 + movd m3, [srcq+strideq*1+0] + movd m7, [srcq+strideq*1+1] + punpckldq m3, m7 + movd m7, [srcq+strideq*1+2] + movd m5, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m7, m5 + punpcklqdq m3, m7 ; 3 +%endif + PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 + PMADDUBSW m1, m4, m5, m7, 0 + PMADDUBSW m2, m4, m5, m7, 0 + PMADDUBSW m3, m4, m5, m7, 0 + PHADDW m0, m1, m14, ARCH_X86_32 + PHADDW m2, m3, m14, 0 + PMULHRSW_8192 m0, m0, m6 + PMULHRSW_8192 m2, m2, m6 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+strideq*0 + PREP_8TAP_H 1, 
srcq+strideq*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + lea srcq, [srcq+strideq*2] + add tmpq, 32 + sub hd, 2 +%else + PREP_8TAP_H 0, srcq + mova [tmpq], m0 + add srcq, strideq + add tmpq, 16 + dec hd +%endif + jg .h_w8 + RET +.h_w16: + mov r3, -16*1 + jmp .h_start +.h_w32: + mov r3, -16*2 + jmp .h_start +.h_w64: + mov r3, -16*4 + jmp .h_start +.h_w128: + mov r3, -16*8 +.h_start: + sub srcq, r3 + mov r5, r3 +.h_loop: +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+r3+8*0 + PREP_8TAP_H 1, srcq+r3+8*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 32 + add r3, 16 +%else + PREP_8TAP_H 0, srcq+r3 + mova [tmpq], m0 + add tmpq, 16 + add r3, 8 +%endif + jl .h_loop + add srcq, strideq + mov r3, r5 + dec hd + jg .h_loop + RET +.v: + LEA base_reg, prep%+SUFFIX +%if ARCH_X86_32 + mov mxd, myd + and mxd, 0x7f +%else + %assign stack_offset org_stack_offset + WIN64_SPILL_XMM 16 + movzx mxd, myb +%endif + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) + mova m2, [base+pw_512] + mova m7, [base+pw_8192] + punpcklwd m0, m0 +%else + punpcklbw m0, m0 + psraw m0, 8 +%endif +%if ARCH_X86_32 + %define subpel0 [rsp+mmsize*0] + %define subpel1 [rsp+mmsize*1] + %define subpel2 [rsp+mmsize*2] + %define subpel3 [rsp+mmsize*3] +%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed + %if cpuflag(ssse3) + ALLOC_STACK -mmsize*4 + %else + ALLOC_STACK -mmsize*5 + %endif +%assign regs_used 7 + mov strideq, [rstk+stack_offset+gprsize*3] + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + lea r5, [strideq*3] + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 + sub srcq, r5 +%else + %define subpel0 m8 + %define subpel1 m9 + %define subpel2 m10 + %define subpel3 m11 + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + lea stride3q, [strideq*3] + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + sub srcq, stride3q + cmp wd, 8 + jns .v_w8 +%endif 
+.v_w4: +%if notcpuflag(ssse3) + pxor m6, m6 + %if ARCH_X86_64 + mova m7, [base+pw_2] + %endif +%endif +%if ARCH_X86_32 + %if STACK_ALIGNMENT < mmsize + %define srcm [esp+stack_size+gprsize*1] + %define tmpm [esp+stack_size+gprsize*2] + %endif + mov tmpm, tmpq + mov srcm, srcq + lea r5d, [wq - 4] ; horizontal loop + shl r5d, (16 - 2) ; (wq / 4) << 16 + mov r5w, hw +.v_w4_loop0: +%endif + movd m1, [srcq+strideq*0] + movd m0, [srcq+strideq*1] +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movd m2, [srcq+strideq*0] + movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movd m3, [srcq+strideq*0] + movd m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] +%else + movd m2, [srcq+strideq*2] + add srcq, stride3q + movd m4, [srcq+strideq*0] + movd m3, [srcq+strideq*1] + movd m5, [srcq+strideq*2] + add srcq, stride3q +%endif + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+strideq*0] + punpckldq m2, m4 ; 2 3 + punpckldq m4, m3 ; 3 4 + punpckldq m3, m5 ; 4 5 + punpckldq m5, m0 ; 5 6 + punpcklbw m2, m4 ; 23 34 + punpcklbw m3, m5 ; 45 56 +.v_w4_loop: +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel0 + %define subpel0 m7 +%endif + mova m5, m1 + PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel1 + %define subpel1 m7 +%endif + mova m1, m2 + PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 + paddw m5, m2 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel2 + %define subpel2 m7 +%endif + mova m2, m3 + PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 + movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + paddw m5, m3 + punpckldq m3, m0, m4 ; 6 7 _ _ + movd m0, [srcq+strideq*0] + punpckldq m4, m0 ; 7 8 _ _ + punpcklbw m3, m4 ; 67 78 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m12, m0 + %else + mova [esp+mmsize*4], m0 + mova m7, subpel3 + %define subpel3 m7 + %endif +%endif + mova m4, m3 + PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 + paddw m5, m4 +%if ARCH_X86_64 || cpuflag(ssse3) + %if 
notcpuflag(ssse3) + SWAP m0, m12 + %endif + PMULHRSW_8192 m5, m5, m7 +%else + mova m0, [esp+mmsize*4] + PMULHRSW_8192 m5, m5, [base+pw_2] +%endif + movq [tmpq+wq*0], m5 + movhps [tmpq+wq*2], m5 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w4_loop +%if ARCH_X86_32 + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add srcq, 4 + add tmpq, 8 + mov srcm, srcq + mov tmpm, tmpq + sub r5d, 1<<16 ; horizontal-- + jg .v_w4_loop0 +%endif + RET +%if ARCH_X86_64 +.v_w8: + lea r6d, [wq*8-64] + mov r5, srcq + mov r8, tmpq + lea r6d, [hq+r6*4] +.v_w8_loop0: + movq m1, [srcq+strideq*0] + movq m2, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + add srcq, stride3q + movq m4, [srcq+strideq*0] + movq m5, [srcq+strideq*1] + movq m6, [srcq+strideq*2] + add srcq, stride3q + movq m0, [srcq+strideq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 +.v_w8_loop: + movq m13, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] +%if cpuflag(ssse3) + pmaddubsw m14, m1, subpel0 ; a0 + pmaddubsw m15, m2, subpel0 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, subpel1 ; a1 + pmaddubsw m4, subpel1 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, subpel2 ; a2 + pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 + paddw m15, m6 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 +%else + mova m14, m1 + PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m15, m2 + PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 + mova m1, m3 + PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 + paddw m14, m3 + mova m3, m5 + PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 + paddw m15, m4 + mova m4, m6 + PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 + paddw m15, m6 + punpcklbw m12, m0, m13 ; 67 + movq 
m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 + paddw m14, m12 + mova m6, m13 + PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 + paddw m15, m13 + PMULHRSW_8192 m14, m14, [base+pw_2] + PMULHRSW_8192 m15, m15, [base+pw_2] +%endif + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w8_loop + add r5, 8 + add r8, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r8 + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +%undef subpel0 +%undef subpel1 +%undef subpel2 +%undef subpel3 +.hv: + %assign stack_offset org_stack_offset + cmp wd, 4 + jg .hv_w8 + and mxd, 0x7f + movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] +%if ARCH_X86_32 + mov mxd, myd + shr myd, 16 + and mxd, 0x7f + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + mov strideq, stridem + %assign regs_used 6 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + lea r5, [strideq*3+1] + sub srcq, r5 + %define subpelv0 [rsp+mmsize*0] + %define subpelv1 [rsp+mmsize*1] + %define subpelv2 [rsp+mmsize*2] + %define subpelv3 [rsp+mmsize*3] + punpcklbw m0, m0 + psraw m0, 8 + pshufd m6, m0, q0000 + mova subpelv0, m6 + pshufd m6, m0, q1111 + mova subpelv1, m6 + pshufd m6, m0, q2222 + mova subpelv2, m6 + pshufd m6, m0, q3333 + mova subpelv3, m6 +%else + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) + ALLOC_STACK mmsize*14, 14 + %else + ALLOC_STACK mmsize*14, 16 + %endif + lea stride3q, [strideq*3] + sub srcq, stride3q + dec srcq + %define subpelv0 m10 + %define subpelv1 m11 + %define subpelv2 m12 + %define subpelv3 m13 + punpcklbw m0, m0 + psraw m0, 8 + %if cpuflag(ssse3) + mova m8, [base+pw_8192] + %else + mova m8, [base+pw_2] + %endif + mova m9, [base+pd_32] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + pshufd m7, m1, q0000 +%if 
notcpuflag(ssse3) + punpcklbw m7, m7 + psraw m7, 8 +%endif +%define hv4_line_0_0 4 +%define hv4_line_0_1 5 +%define hv4_line_0_2 6 +%define hv4_line_0_3 7 +%define hv4_line_0_4 8 +%define hv4_line_0_5 9 +%define hv4_line_1_0 10 +%define hv4_line_1_1 11 +%define hv4_line_1_2 12 +%define hv4_line_1_3 13 +%if ARCH_X86_32 + %if cpuflag(ssse3) + %define w8192reg [base+pw_8192] + %else + %define w8192reg [base+pw_2] + %endif + %define d32reg [base+pd_32] +%else + %define w8192reg m8 + %define d32reg m9 +%endif + ; lower shuffle 0 1 2 3 4 +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%else + %if ARCH_X86_64 + mova m15, [pw_1] + %else + %define m15 m1 + %endif +%endif + movq m5, [srcq+strideq*0] ; 0 _ _ _ + movhps m5, [srcq+strideq*1] ; 0 _ 1 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 2 _ _ _ + movhps m4, [srcq+strideq*1] ; 2 _ 3 _ + lea srcq, [srcq+strideq*2] +%else + movq m4, [srcq+strideq*2] ; 2 _ _ _ + movhps m4, [srcq+stride3q ] ; 2 _ 3 _ + lea srcq, [srcq+strideq*4] +%endif + PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg + SAVELINE_W4 m2, 2, 0 + ; upper shuffle 2 3 4 5 6 +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m2 + %else + mova [esp+mmsize*4], m2 + %endif +%endif + ; lower shuffle +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%endif + movq m5, [srcq+strideq*0] ; 4 _ _ _ + movhps m5, 
[srcq+strideq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 6 _ _ _ + add srcq, strideq +%else + movq m4, [srcq+strideq*2] ; 6 _ _ _ + add srcq, stride3q +%endif + PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg + SAVELINE_W4 m3, 3, 0 + ; upper shuffle +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m2, m14 + %else + mova m2, [esp+mmsize*4] + %endif +%endif + ;process high + PALIGNR m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + ;process low + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 +.hv_w4_loop: + ;process low + pmaddwd m5, m1, subpelv0 ; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+mmsize*4], m5 + %define m15 m3 + %endif +%endif +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%endif + movq m4, [srcq+strideq*0] ; 7 + movhps m4, [srcq+strideq*1] ; 7 _ 8 _ + PSHUFB_SUBPEL_H_4a m4, m4, m6, 
m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+mmsize*4] + %endif +%endif + paddd m5, d32reg ; pd_32 + paddd m5, m4 + psrad m5, 6 + SAVELINE_W4 m0, 0, 0 + SAVELINE_W4 m1, 1, 0 + SAVELINE_W4 m2, 2, 0 + SAVELINE_W4 m3, 3, 0 + SAVELINE_W4 m5, 5, 0 + ;process high + RESTORELINE_W4 m0, 0, 1 + RESTORELINE_W4 m1, 1, 1 + RESTORELINE_W4 m2, 2, 1 + RESTORELINE_W4 m3, 3, 1 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+0xA0], m5 + %endif +%endif +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + movq m4, [srcq+strideq*0] ; 7 + movhps m4, [srcq+strideq*1] ; 7 _ 8 _ + PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+0xA0] + %endif +%endif + paddd m5, d32reg ; pd_32 + paddd m5, m4 + psrad m4, m5, 6 + RESTORELINE_W4 m5, 5, 0 + packssdw m5, m4 + pshufd m5, m5, q3120 + movu [tmpq], m5 + lea srcq, [srcq+strideq*2] + add tmpq, 16 + sub hd, 2 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + RESTORELINE_W4 m0, 0, 0 + RESTORELINE_W4 m1, 1, 0 + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + jg .hv_w4_loop + RET +%undef subpelv0 +%undef subpelv1 +%undef subpelv2 +%undef subpelv3 +.hv_w8: + %assign stack_offset 
org_stack_offset +%define hv8_line_1 0 +%define hv8_line_2 1 +%define hv8_line_3 2 +%define hv8_line_4 3 +%define hv8_line_6 4 + shr mxd, 16 +%if ARCH_X86_32 + %define subpelh0 [rsp+mmsize*5] + %define subpelh1 [rsp+mmsize*6] + %define subpelv0 [rsp+mmsize*7] + %define subpelv1 [rsp+mmsize*8] + %define subpelv2 [rsp+mmsize*9] + %define subpelv3 [rsp+mmsize*10] + %define accuv0 [rsp+mmsize*11] + %define accuv1 [rsp+mmsize*12] + movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + mov mxd, myd + shr myd, 16 + and mxd, 0x7f + cmp hd, 6 + cmovs myd, mxd + movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + mov strideq, stridem + %assign regs_used 6 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + %if STACK_ALIGNMENT < mmsize + %define tmpm [rsp+mmsize*13+gprsize*1] + %define srcm [rsp+mmsize*13+gprsize*2] + %define stridem [rsp+mmsize*13+gprsize*3] + mov tmpm, tmpq + mov stridem, strideq + %endif + %if cpuflag(ssse3) + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + %else + punpcklbw m1, m1 + psraw m1, 8 + pshufd m0, m1, q1010 + punpckhqdq m1, m1 + %endif + punpcklbw m5, m5 + psraw m5, 8 + pshufd m2, m5, q0000 + pshufd m3, m5, q1111 + pshufd m4, m5, q2222 + pshufd m5, m5, q3333 + mova subpelh0, m0 + mova subpelh1, m1 + mova subpelv0, m2 + mova subpelv1, m3 + mova subpelv2, m4 + mova subpelv3, m5 + lea r5, [strideq*3+3] + sub srcq, r5 + mov srcm, srcq +%else + ALLOC_STACK mmsize*5, 16 + %define subpelh0 m10 + %define subpelh1 m11 + %define subpelv0 m12 + %define subpelv1 m13 + %define subpelv2 m14 + %define subpelv3 m15 + %define accuv0 m8 + %define accuv1 m9 + movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) + pshufd subpelh0, m0, q0000 + pshufd subpelh1, m0, q1111 + %else + punpcklbw m0, m0 + psraw m0, 8 + pshufd subpelh0, m0, q1010 + pshufd subpelh1, m0, q3232 + mova m7, [base+pw_2] + %endif + punpcklbw m1, m1 + psraw m1, 
8 + pshufd subpelv0, m1, q0000 + pshufd subpelv1, m1, q1111 + pshufd subpelv2, m1, q2222 + pshufd subpelv3, m1, q3333 + lea stride3q, [strideq*3] + sub srcq, 3 + sub srcq, stride3q + mov r6, srcq + mov r8, tmpq +%endif + lea r5d, [wq-4] + shl r5d, 14 + add r5d, hd +.hv_w8_loop0: +%if cpuflag(ssse3) + %if ARCH_X86_64 + mova m7, [base+subpel_h_shufA] + mova m8, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] + %define shufA m7 + %define shufB m8 + %define shufC m9 + %else + %define shufA [base+subpel_h_shufA] + %define shufB [base+subpel_h_shufB] + %define shufC [base+subpel_h_shufC] + %endif +%endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 +%if ARCH_X86_64 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 + add srcq, stride3q + PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 +%else + lea srcq, [srcq+strideq*2] + %if notcpuflag(ssse3) + mova [esp], m4 + %endif + PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 + PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 + lea srcq, [srcq+strideq*2] +%endif +%if cpuflag(ssse3) + mova m7, [base+pw_8192] +%else + mova m7, [base+pw_2] + %if ARCH_X86_32 + mova m4, [esp] + %endif +%endif + PMULHRSW_8192 m4, m4, m7 + PMULHRSW_8192 m5, m5, m7 + PMULHRSW_8192 m6, m6, m7 + PMULHRSW_8192 m0, m0, m7 + punpcklwd m1, m4, m5 ; 01 + punpcklwd m2, m5, m6 ; 12 + punpcklwd m3, m6, m0 ; 23 + SAVELINE_W8 1, m1 + SAVELINE_W8 2, m2 + SAVELINE_W8 3, m3 +%if cpuflag(ssse3) + mova m7, [base+subpel_h_shufA] +%endif +%if ARCH_X86_64 + PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 + PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 + add srcq, stride3q + PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 +%else + %if notcpuflag(ssse3) + mova [esp+0x30], m0 + %endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 + lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 +%endif +%if cpuflag(ssse3) + mova m7, [base+pw_8192] +%elif ARCH_X86_32 + mova m0, [esp+0x30] + mova m7, [base+pw_2] +%endif + PMULHRSW_8192 m1, m4, 
m7 + PMULHRSW_8192 m2, m5, m7 + PMULHRSW_8192 m3, m6, m7 + punpcklwd m4, m0, m1 ; 34 + punpcklwd m5, m1, m2 ; 45 + punpcklwd m6, m2, m3 ; 56 + SAVELINE_W8 6, m3 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 +.hv_w8_loop: + SAVELINE_W8 1, m3 + SAVELINE_W8 2, m4 + SAVELINE_W8 3, m5 + SAVELINE_W8 4, m6 +%if ARCH_X86_32 + pmaddwd m0, m1, subpelv0 ; a0 + pmaddwd m7, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m0, m3 + paddd m7, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m0, m5 + paddd m7, m6 + mova m5, [base+pd_32] + paddd m0, m5 + paddd m7, m5 + mova accuv0, m0 + mova accuv1, m7 +%else + pmaddwd accuv0, m1, subpelv0 ; a0 + pmaddwd accuv1, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd accuv0, m3 + paddd accuv1, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd accuv0, m5 + paddd accuv1, m6 + mova m7, [base+pd_32] + paddd accuv0, m7 + paddd accuv1, m7 + %if cpuflag(ssse3) + mova m7, [base+subpel_h_shufB] + mova m6, [base+subpel_h_shufC] + mova m5, [base+subpel_h_shufA] + %define shufA m5 + %define shufB m7 + %define shufC m6 + %endif +%endif + PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 + lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 +%if cpuflag(ssse3) + mova m5, [base+pw_8192] +%else + mova m5, [base+pw_2] +%endif + PMULHRSW_8192 m0, m0, m5 + PMULHRSW_8192 m4, m4, m5 + RESTORELINE_W8 6, m6 + punpcklwd m5, m6, m0 ; 67 + punpcklwd m6, m0, m4 ; 78 + pmaddwd m1, m5, subpelv3 ; a3 + paddd m2, m1, accuv0 + pmaddwd m1, m6, subpelv3 ; b3 + paddd m1, m1, accuv1 + psrad m2, 6 + psrad m1, 6 + packssdw m2, m1 + movq [tmpq+wq*0], m2 + movhps [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jle .hv_w8_outer + SAVELINE_W8 6, m4 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 + RESTORELINE_W8 4, m4 + jmp .hv_w8_loop +.hv_w8_outer: +%if ARCH_X86_32 + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add 
srcq, 4
+    add tmpq, 8
+    mov srcm, srcq
+    mov tmpm, tmpq
+%else
+    add r6, 4
+    add r8, 8
+    movzx hd, r5b
+    mov srcq, r6
+    mov tmpq, r8
+%endif
+    sub r5d, 1<<16
+    jg .hv_w8_loop0
+    RET
+%endmacro
+
+%macro movifprep 2 ; dst, src: expands to "mov dst, src" when isprep is nonzero and to nothing otherwise
+ %if isprep
+    mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2 ; to, from: redefine r<to>, r<to>q and r<to>d as whatever r<from>, r<from>q and r<from>d expand to at this point
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 ; only acts when isprep is nonzero: saves r14 as r14_save, then makes r<N> an alias of r<N-1> for N = 14..1
+ %if isprep
+  %xdefine r14_save r14
+  %assign %%i 14 ; count downwards so every name is copied from a lower one that has not been remapped yet
+  %rep 14
+   %assign %%j %%i-1
+   REMAP_REG %%i, %%j ; NOTE(review): presumably compensates for prep_8tap_scaled having one leading argument fewer than put_8tap_scaled (tmp vs. dst, ds), so shared code can use put's register numbers - confirm
+   %assign %%i %%i-1
+  %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 ; only acts when isprep is nonzero: counterpart of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV, consumes the r14_save it created
+ %if isprep
+  %assign %%i 1 ; count upwards: r<i> takes over the current definition of r<i+1>
+  %rep 13
+   %assign %%j %%i+1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i+1
+  %endrep
+  %xdefine r14 r14_save
+  %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged (optional argument, defaults to 1)
+    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT ; RET below is expanded with the un-shifted register names; NOTE(review): presumably because the epilogue refers to registers by number - confirm
+    RET
+ %if %1
+    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV ; put the shifted names back for the code that follows this return
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]; filters two source rows and advances srcq by 2*ssq
+    SWAP m%2, m%5 ; exchange the two register names so that row 1 can accumulate in m%5; undone by the SWAP at the end
+    movq m%1, [srcq+ r4] ; row 0: eight 8-byte loads at offsets r4, r6, r7, r9, r10, r11, r13 and rX
+    movq m%2, [srcq+ r6]
+    movhps m%1, [srcq+ r7]
+    movhps m%2, [srcq+ r9]
+    movq m%3, [srcq+r10]
+    movq m%4, [srcq+r11]
+    movhps m%3, [srcq+r13]
+    movhps m%4, [srcq+ rX]
+    add srcq, ssq
+    movq m%5, [srcq+ r4] ; row 1: the same eight offsets, one source row further
+    movq m%6, [srcq+ r6]
+    movhps m%5, [srcq+ r7]
+    movhps m%6, [srcq+ r9]
+    movq m%7, [srcq+r10]
+    movq m%8, [srcq+r11]
+    movhps m%7, [srcq+r13]
+    movhps m%8, [srcq+ rX]
+    add srcq, ssq
+    pmaddubsw m%1, m%9 ; multiply the loaded bytes with the weight registers; each weight register is applied to both rows
+    pmaddubsw m%5, m%9
+    pmaddubsw m%2, m%10
+    pmaddubsw m%6, m%10
+    pmaddubsw m%3, m%11
+    pmaddubsw m%7, m%11
+    pmaddubsw m%4, m%12
+    pmaddubsw m%8, m%12
+    phaddw m%1, m%2 ; two levels of horizontal adds reduce the products to one word per 8-byte load
+    phaddw m%5, m%6
+    phaddw m%3, m%4
+    phaddw m%7, m%8
+    phaddw m%1, m%3 ; row 0: eight sums
+    phaddw m%5, m%7 ; row 1: eight sums
+    pmulhrsw m%1, m12 ; m12 is not a macro argument; MC_8TAP_SCALED loads pw_8192 into it during setup
+    pmulhrsw m%5, m12
+    SWAP m%2, m%5 ; after swapping back, the row 1 result is addressed as m%2 (row 0 is in m%1)
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+
%else +cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+0x138] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + LEA base_reg, %1_8tap_scaled_ssse3 +%define base base_reg-%1_8tap_scaled_ssse3 + tzcnt wd, wm + movd m8, dxm + movd m14, mxm + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isprep && UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+0x94] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + mova m10, [base+pd_0x3ff] + mova m12, [base+pw_8192] +%ifidn %1, put + mova m13, [base+pd_512] +%else + mova m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; 
mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + movhps m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 1 2 3 + pmulhrsw m1, m12 ; 4 5 6 7 + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + pmaddwd m8, m4, m11 + paddd m5, m6 + paddd m7, m8 + paddd m5, m13 + paddd m5, m7 + psrad m5, 10 + packssdw m5, m5 + packuswb m5, m5 + pextrw r6d, m5, 0 + mov [dstq], r6w + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw 
m5, m12 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw m5, m12 ; 6 7 6 7 + palignr m4, m5, m1, 8 ; 4 5 6 7 + pshufd m5, m4, q0321 ; 5 6 7 _ + mova m1, m4 + punpcklwd m2, m4, m5 ; 45 56 + punpckhwd m4, m5 ; 67 __ + jmp .w2_loop + SWAP m15, m8, m9 +%endif +.w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] + mova m5, [base+bdct_lb_dw] + movq m6, [base+subpel_s_shuf2] + pcmpeqd m0, m9 + psrld m14, 10 + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m6, m6 + punpcklqdq m15, m2 + pshufb m14, m5 + paddb m14, m6 + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pand m11, m0 + pandn m0, m15 + SWAP m15, m0 + por m15, m11 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m3, m14 + pshufb m5, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + pmaddubsw m2, m15 + pmaddubsw m4, m15 + pmaddubsw m3, m15 + pmaddubsw m5, m15 + phaddw m7, m9 + phaddw m8, m10 + phaddw m9, m2, m4 + phaddw m3, m5 + pmulhrsw m7, m12 ; 0 1 + pmulhrsw m8, m12 ; 2 3 + pmulhrsw m9, m12 ; 4 5 + pmulhrsw m3, m12 ; 6 7 + 
shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + psrldq m11, m3, 8 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m11 ; 67 + mova [rsp+0x00], m7 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m10 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + packssdw m4, m4 +%ifidn %1, put + packuswb m4, m4 + movd [dstq], m4 + add dstq, dsq +%else + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu m4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova m0, [rsp+0x00] + mova [rsp+0x00], m1 + mova m1, [rsp+0x10] + mova [rsp+0x10], m2 + mova m2, [rsp+0x20] + mova [rsp+0x20], m3 + pshufb m4, m14 + pmaddubsw m4, m15 + phaddw m4, m4 + pmulhrsw m4, m12 + punpcklwd m3, m11, m4 + mova m11, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m6, [rsp+0x10] + mova m7, [rsp+0x20] + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m4, m5 + pmulhrsw m4, m12 + punpcklwd m9, m11, m4 + mova [rsp+0x00], m6 + mova [rsp+0x10], m7 + mova [rsp+0x20], m9 + psrldq m11, m4, 8 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m11 + jmp .w4_loop + SWAP m0, m15 +.w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp 
.w_start +.w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + jmp .hloop +.hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m11, m4 + pand m8, m11, m6 + pand m15, m11, m14 + pand 
m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m11, m5 + mova [rsp+0x10], m7 + mova [rsp+0x20], m8 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m9, [rsp+0x80] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + SWAP m14, m8 +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m8, [rsp+0x70], m11 + pmaddwd m9, [rsp+0x80], m11 + paddd m4, m6 + paddd m5, m7 + paddd m4, m8 + paddd m5, m9 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+0x140], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov 
r9d, [rsp+12] + jz .skip_line + mova m14, [base+unpckw] + movq m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + add srcq, ssq + mov myd, [rsp+0x140] + mov dyd, dym + pshufd m9, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m9 ; 3a 2a + pshufb m3, m9 ; 3b 2b + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] + phaddw m6, m7 + phaddw m4, m5 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x50], m14 ; 4a 5a + pshufb m6, [rsp+0x60], m14 ; 4b 5b + pshufb m7, [rsp+0x70], m9 ; 7a 6a + pshufb m8, [rsp+0x80], m9 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m8 + jmp .vloop +.skip_line: + mova m0, [rsp+0x10] + mova m1, [rsp+0x20] + mova m14, [rsp+0x30] + mova m15, [rsp+0x40] + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 + mov myd, [rsp+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [rsp+0x50] ; 23a + mova m3, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m6, [rsp+0x80] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m4 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, 
[base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + pshufd m8, m10, q0000 + pshufd m9, m10, q1111 + pshufd m11, m10, q3333 + pshufd m10, m10, q2222 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + palignr m2, m1, m0, 4 + pshufd m4, m1, q2121 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + punpcklwd m2, m1, m4 ; 45 56 +.dy1_w2_loop: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + mova m3, m0 + mova m0, m2 + paddd m5, m13 + paddd m6, m7 + pshufb m1, m14 + pmaddubsw m1, m15 + phaddw m1, m1 + pmulhrsw m1, m12 + palignr m7, m1, m4, 12 + punpcklwd m2, m7, m1 ; 67 78 + pmaddwd m7, m2, m11 + mova m4, m1 + paddd m5, m6 + paddd m5, m7 + psrad m5, rndshift + packssdw m5, m5 + packuswb m5, m5 + pextrw r4d, m5, 0 + pextrw r6d, m5, 1 + mov [dstq+dsq*0], r4w + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET + SWAP m15, m8, m9 +%endif +.dy1_w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + 
psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m4, [base+subpel_filters+r11*8+2] + movd m5, [base+subpel_filters+ r6*8+2] + movd m7, [base+subpel_filters+r13*8+2] + movq m6, [base+subpel_s_shuf2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pcmpeqd m8, m9 + psrld m14, 10 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m5 + punpckldq m4, m7 + punpcklqdq m6, m6 + punpcklqdq m15, m4 + pshufb m14, [base+bdct_lb_dw] + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m7, [srcq+ssq*2] + add srcq, ss3q + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m10, r4q + punpcklbw m10, m10 + psraw m10, 8 + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pshufb m7, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + pmaddubsw m7, m15 + phaddw m0, m1 + phaddw m2, m3 + phaddw m4, m5 + phaddw m6, m7, m7 + pmulhrsw m0, m12 ; 0 1 + pmulhrsw m2, m12 ; 2 3 + pmulhrsw m4, m12 ; 4 5 + pmulhrsw m6, m12 ; 6 _ + shufps m1, m0, m2, q1032 ; 1 2 + shufps m3, m2, m4, q1032 ; 3 4 + shufps m5, m4, m6, q1032 ; 5 6 + punpcklwd m7, m0, m1 ; 01 + punpckhwd m0, m1 ; 12 + punpcklwd m8, m2, m3 ; 23 + punpckhwd m2, m3 ; 34 + punpcklwd m9, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pshufd m1, m10, q0000 + pshufd m3, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 +.dy1_w4_loop: + movu m11, [srcq+ssq*0] + pmaddwd m7, m1 + pmaddwd m8, m3 + pmaddwd m0, m1 + pmaddwd m2, m3 + pmaddwd m9, m5 + pmaddwd m4, m5 + paddd m7, m8 + paddd m0, m2 + movu m8, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m11, m14 + 
pmaddubsw m11, m15 + paddd m7, m13 + paddd m0, m13 + paddd m7, m9 + paddd m0, m4 + pshufb m8, m14 + pmaddubsw m8, m15 + phaddw m11, m8 + mova m8, [rsp+0x20] + pmulhrsw m11, m12 + punpcklwd m9, m6, m11 ; 67 + psrldq m6, m11, 8 + punpcklwd m4, m11, m6 ; 78 + pmaddwd m2, m9, m10 + pmaddwd m11, m4, m10 + paddd m7, m2 + mova m2, [rsp+0x30] + paddd m0, m11 + psrad m7, rndshift + psrad m0, rndshift + packssdw m7, m0 + mova m0, [rsp+0x10] +%ifidn %1, put + packuswb m7, m7 + psrldq m11, m7, 4 + movd [dstq+dsq*0], m7 + movd [dstq+dsq*1], m11 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m7 + add tmpq, 16 +%endif + sub hd, 2 + jz .ret + mova m7, [rsp+0x00] + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 + jmp .dy1_w4_loop + SWAP m8, m15 +.dy1_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if UNIX64 + mov hm, hd +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) 
+ mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.dy1_hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + punpcklbw m14, m14 + psraw m14, 8 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + 
SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + mova m14, [base+unpckw] +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m15, [rsp+0x70], m11 + paddd m4, m6 + pmaddwd m6, [rsp+0x80], m11 + paddd m5, m7 + paddd m4, m15 + paddd m5, m6 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + movq m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + add srcq, ssq + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 2a + pshufb m3, m15 ; 3b 2b + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + phaddw m4, m5 + phaddw m6, m7 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x70], m15 ; 7a 6a + pshufb m7, [rsp+0x80], m15 ; 7b 6b + pshufb m6, [rsp+0x50], m14 ; 4a 5a + pshufb m15, [rsp+0x60], m14 ; 4b 5b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m6 ; 34a + punpcklwd m3, m15 ; 34b + punpckhwd m6, m5 ; 56a + punpckhwd m15, m7 ; 56b + punpcklwd m5, m4 ; 78a + psrldq m4, 8 + punpcklwd m7, m4 ; 78b + mova [rsp+0x50], m6 + mova 
[rsp+0x60], m15 + mova [rsp+0x70], m5 + mova [rsp+0x80], m7 + jmp .dy1_vloop +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + movhps m0, [srcq+ssq*2] + movhps m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + movq m3, [srcq+ssq*0] + movhps m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m0, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + movq m11, r4q + punpcklbw m11, m11 + psraw m11, 8 + pslldq m2, m3, 8 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 + pshufd m1, m1, q3110 ; 1 3 3 5 + punpcklwd m3, m2, m1 ; 01 23 + punpckhwd m2, m1 ; 23 45 +.dy2_w2_loop: + movq m6, [srcq+ssq*0] + movq m7, [srcq+ssq*1] + movhps m6, [srcq+ssq*2] + movhps m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m4, m3, m8 + pmaddwd m5, m2, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + phaddw m6, m7 + pmulhrsw m6, m12 + psrldq m7, m6, 8 + palignr m6, m0, 8 + palignr m7, m1, 8 + mova m0, m6 + mova m1, m7 + pshufd m6, m6, q3221 + pshufd 
m7, m7, q3221 + punpcklwd m3, m6, m7 ; 45 67 + punpckhwd m2, m6, m7 ; 67 89 + pmaddwd m6, m3, m10 + pmaddwd m7, m2, m11 + paddd m4, m5 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + packssdw m4, m4 + packuswb m4, m4 + movd r4d, m4 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET + SWAP m15, m8, m9 +%endif +.dy2_w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m4, [base+subpel_filters+r11*8+2] + movd m5, [base+subpel_filters+ r6*8+2] + movd m7, [base+subpel_filters+r13*8+2] + movq m6, [base+subpel_s_shuf2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pcmpeqd m8, m9 + psrld m14, 10 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*2] + movu m1, [srcq+ssq*1] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m5 + punpckldq m4, m7 + punpcklqdq m6, m6 + punpcklqdq m15, m4 + pshufb m14, [base+bdct_lb_dw] + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m11, r4q + punpcklbw m11, m11 + psraw m11, 8 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m0, m2 + phaddw m1, m3 + phaddw m4, m5 + pmulhrsw m0, m12 ; 0 2 + pmulhrsw m1, m12 ; 1 3 + pmulhrsw m4, m12 ; 4 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + psrldq m5, m4, 8 ; 5 _ + 
punpckhwd m2, m0, m1 ; 23 + punpcklwd m0, m1 ; 01 + punpcklwd m4, m5 ; 45 +.dy2_w4_loop: + pmaddwd m0, m8 ; a0 + pmaddwd m5, m2, m8 ; b0 + pmaddwd m2, m9 ; a1 + pmaddwd m7, m4, m9 ; b1 + pmaddwd m3, m4, m10 ; a2 + paddd m0, m13 + paddd m5, m13 + paddd m0, m2 + paddd m5, m7 + paddd m0, m3 + movu m6, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m6, m14 + pshufb m7, m14 + pshufb m3, m14 + pshufb m1, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + pmaddubsw m3, m15 + pmaddubsw m1, m15 + phaddw m6, m7 + phaddw m3, m1 + pmulhrsw m6, m12 ; 6 7 + pmulhrsw m3, m12 ; 8 9 + psrldq m7, m6, 8 + psrldq m1, m3, 8 + punpcklwd m6, m7 ; 67 + punpcklwd m3, m1 ; 89 + mova m2, m6 + pmaddwd m1, m6, m10 ; b2 + pmaddwd m6, m11 ; a3 + pmaddwd m7, m3, m11 ; b3 + paddd m5, m1 + paddd m0, m6 + paddd m5, m7 + psrad m0, rndshift + psrad m5, rndshift + packssdw m0, m5 +%ifidn %1, put + packuswb m0, m0 + psrldq m1, m0, 4 + movd [dstq+dsq*0], m0 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m0 + add tmpq, 16 +%endif + mova m0, m4 + mova m4, m3 + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET + SWAP m8, m15 +.dy2_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 + mova [rsp+0x100], 
m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if UNIX64 + mov hm, hd +%endif + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.dy2_hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + 
mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m15, [rsp+0x70], m11 + paddd m4, m6 + pmaddwd m6, [rsp+0x80], m11 + paddd m5, m7 + paddd m4, m15 + paddd m5, m6 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m8, [rsp+0x10] + mova m9, [rsp+0x20] + mova m10, [rsp+0x30] + mova m11, [rsp+0x40] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 + mova m3, [rsp+0x50] ; 23a + mova m4, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m7, [rsp+0x80] ; 45b + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m14, m2, m6 ; 67a + punpckhwd m2, m6 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m7 + mova [rsp+0x70], m14 + mova [rsp+0x80], m2 + mova m2, 
m3
+ mova m3, m4
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+; BILIN_SCALED_FN <put|prep>
+; Emits the %1_bilin_scaled entry point. It loads the same constant,
+; (5*15 << 16) | 5*15, into both t0d and t1d and then tail-jumps into the
+; %1_8tap_scaled function, so the bilinear variant has no body of its own.
+; NOTE(review): the constant presumably selects the bilinear rows of the
+; subpel filter table, in the way the FN entry points select theirs - confirm
+; against the FN macro, which is defined outside this section.
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+
+; Instantiation of the scaled put/prep functions (x86-64 builds only).
+; For put and then prep: choose the two temporaries with DECLARE_REG_TMP
+; (different register numbers for WIN64 and for the other ABI), emit the
+; bilin entry point, one FN line per filter-type pair, and finally expand
+; MC_8TAP_SCALED, which is the function BILIN_SCALED_FN jumps to.
+; NOTE(review): DECLARE_REG_TMP presumably binds t0/t1 to the given register
+; numbers - confirm against its definition elsewhere in the file.
+%if ARCH_X86_64
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+%endif
+
+; Helper macros for the warp_affine code below.
+; On x86-32 the warp code %defines alpha and delta to the same register (r0)
+; and beta and gamma to the same register (r1), and keeps my in r5, so the
+; values used by the horizontal pass (mx, alpha, beta) and by the vertical
+; pass (my, delta, gamma) are swapped through their memory homes:
+;   SAVE_ALPHA_BETA      store alphad/betad to alpham/betam
+;   SAVE_DELTA_GAMMA     store deltad/gammad to deltam/gammam
+;   LOAD_ALPHA_BETA_MX   spill myd to mym, then reload alpha, beta and mx
+;   LOAD_DELTA_GAMMA_MY  spill mxd to mxm, then reload delta, gamma and my
+; PIC_sym(sym) addresses a symbol relative to PIC_reg (r2) using its offset
+; from $$; in the non-x86-32 branch the two SAVE_* macros expand to nothing
+; and PIC_sym(sym) is the plain symbol.
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ 
%define PIC_sym(sym) sym
+%endif
+
+; copy_args (x86-32 only): 8*4 when the ABI stack alignment is smaller than
+; the alignment the function requires, otherwise 0. The value is subtracted
+; in the warp functions' cglobal stack size and gates RELOC_ARGS.
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+; RELOC_ARGS: when copy_args is non-zero, copy the arguments r0m-r3m and r5m
+; into the dstm/dsm/srcm/ssm/mxm slots and r6m into mym (through r0).
+; Overwrites r0-r3 and r5. Expands to nothing when copy_args is 0.
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
+
+; BLENDHWDW dst, src: put the high word of every dword of src into dst.
+; SSE4 uses pblendw with mask 0xAA (words 1, 3, 5, 7 taken from %2).
+; Otherwise %2 is ANDed with m10 and ORed into %1, which modifies %2 and
+; relies on the caller: m10 must hold the high-word mask (the warp code loads
+; it from blendmask) and the high words of %1 must already be zero (the warp
+; code shifts %1 right by 16 first).
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
+
+; WARP_V: vertical filter step producing one output row as two dword vectors.
+; Eight 8-byte filters a..h are fetched from filterq at index (my + k*delta)
+; >> 10 for k = 0..7, transposed, and widened with a zero low byte (hence the
+; "<< 8" in the lane comments). Filters a-d are multiplied (pmaddwd) with
+; %3, %5, %7, %9 and summed into %1; filters e-h with %4, %6, %8, %10 into %2.
+; On exit my = my + 3*delta + gamma; the callers pre-subtract delta*3 from
+; gamma, so the net step is the "my += gamma" noted below.
+; Clobbers m0-m3, m8, m9, m14, m15 (and m11 when it is zeroed here), tmp1,
+; tmp2. On x86-32 the high register names are aliased onto m4-m7.
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ ; NOTE(review): x86inc SWAP presumably only renames registers at assembly
+ ; time; this swap is undone by the matching SWAP at the end of the macro.
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+; counterd: loop counter of the warp functions. It is set to 4 in
+; warp_affine_8x8's .main and decremented once per two output rows. It lives
+; in r4d on x86-64 and in a stack dword on x86-32.
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+; WARP_AFFINE_8X8T: variant of warp_affine_8x8 that stores 16-bit results to
+; tmp instead of 8-bit pixels. It has no filtering code of its own: it calls
+; warp_affine_8x8's .main/.main2 and exits through that function's .end.
+; Each .loop iteration rounds and packs the two rows left in m12-m15
+; (reloaded from [esp+0xC0..0xF0] on x86-32), stores them at tmpq+tsq*0 and
+; tmpq+tsq*2, and advances tmpq by tsq*4.
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+%else
+ ; without pmulhrsw: add the rounding constant, then shift by 15 in one step
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
+%if ARCH_X86_32
+ ; save the output pointer and reload the two values kept at [esp+0x100] and
+ ; [esp+0x104] (the slots warp_affine_8x8's .loop reloads as alpha and beta)
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+; WARP_AFFINE_8X8: 8-bit warp_affine_8x8 function; it also provides the
+; .main/.main2/.end labels that warp_affine_8x8t calls and jumps to.
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, 
abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma +%else +cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \ + dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma + %define alphaq r0 + %define alphad r0 + %define alpham [esp+gprsize+0x100] + %define betaq r1 + %define betad r1 + %define betam [esp+gprsize+0x104] + %define deltaq r0 + %define deltad r0 + %define deltam [esp+gprsize+0x108] + %define gammaq r1 + %define gammad r1 + %define gammam [esp+gprsize+0x10C] + %define filterq r3 + %define tmp1q r4 + %define tmp1d r4 + %define tmp1m [esp+gprsize+0x110] + %define myq r5 + %define myd r5 + %define mym r6m + %if copy_args + %define dstm [esp+stack_size-4*1] + %define dsm [esp+stack_size-4*2] + %define srcm [esp+stack_size-4*3] + %define ssm [esp+stack_size-4*4] + %define mxm [esp+stack_size-4*5] + %define mym [esp+stack_size-4*6] + %endif +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_32 + mov dstm, dstd + mov alphad, [esp+0x100] + mov betad, [esp+0x104] +%endif + call .main2 + lea dstq, [dstq+dsq*2] +.start: +%if notcpuflag(sse4) + %if cpuflag(ssse3) + %define roundval pw_8192 + %else + %define roundval pd_262144 + %endif + %if ARCH_X86_64 + mova m10, [PIC_sym(roundval)] + %else + %define m10 [PIC_sym(roundval)] + %endif +%endif +%if ARCH_X86_32 + %define m12 m5 + %define m13 m6 + mova m12, [esp+0xC0] + mova m13, [esp+0xD0] +%endif +%if cpuflag(sse4) + %if ARCH_X86_32 + %define m11 m4 + pxor m11, m11 + %endif + psrad m12, 18 + psrad m13, 18 + packusdw m12, m13 + pavgw m12, m11 ; (x + (1 << 10)) >> 11 +%else + %if cpuflag(ssse3) + psrad m12, 17 + psrad m13, 17 + packssdw m12, m13 + pmulhrsw m12, m10 + %else + paddd m12, m10 + paddd m13, m10 + psrad m12, 19 + psrad m13, 19 + packssdw m12, m13 + %endif +%endif +%if ARCH_X86_32 + %define m14 m6 + %define m15 m7 + mova m14, [esp+0xE0] + mova m15, [esp+0xF0] +%endif +%if cpuflag(sse4) + psrad m14, 18 + psrad m15, 18 + packusdw m14, m15 + pavgw m14, m11 ; (x + (1 
<< 10)) >> 11 +%else + %if cpuflag(ssse3) + psrad m14, 17 + psrad m15, 17 + packssdw m14, m15 + pmulhrsw m14, m10 + %else + paddd m14, m10 + paddd m15, m10 + psrad m14, 19 + psrad m15, 19 + packssdw m14, m15 + %endif +%endif + packuswb m12, m14 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + dec counterd + jg .loop +.end: + RET +ALIGN function_align +.main: +%assign stack_offset stack_offset+gprsize +%if ARCH_X86_32 + %assign stack_size stack_size+4 + %if copy_args + %assign stack_offset stack_offset-4 + %endif + RELOC_ARGS + LEA PIC_reg, $$ + %define PIC_mem [esp+gprsize+0x114] + mov abcdd, abcdm + %if copy_args == 0 + mov ssd, ssm + mov mxd, mxm + %endif + mov PIC_mem, PIC_reg + mov srcd, srcm +%endif + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 + SAVE_DELTA_GAMMA +%if ARCH_X86_32 + mov abcdd, abcdm +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + lea tmp1q, [ssq*3+3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + 3 +%if ARCH_X86_32 + mov srcm, srcd + mov PIC_reg, PIC_mem +%endif + sub betad, tmp2d ; beta -= alpha*3 + lea filterq, [PIC_sym(mc_warp_filter)] +%if ARCH_X86_64 + mov myd, r6m + %if cpuflag(ssse3) + pxor m11, m11 + %endif +%endif + call .h + psrld m2, m0, 16 + psrld m3, m1, 16 +%if ARCH_X86_32 + %if notcpuflag(ssse3) + mova [esp+gprsize+0x00], m2 + %endif + mova [esp+gprsize+0x10], m3 +%endif + call .h + psrld m4, m0, 16 + psrld m5, m1, 16 +%if ARCH_X86_32 + mova [esp+gprsize+0x20], m4 + mova [esp+gprsize+0x30], m5 +%endif + call .h +%if ARCH_X86_64 + %define blendmask [rsp+gprsize+0x80] +%else + %if notcpuflag(ssse3) + mova m2, [esp+gprsize+0x00] + %endif + mova m3, [esp+gprsize+0x10] + %define blendmask [esp+gprsize+0x120] + %define m10 m7 +%endif + pcmpeqd m10, m10 + pslld m10, 16 + mova blendmask, m10 + BLENDHWDW m2, m0 ; 0 + BLENDHWDW m3, m1 ; 2 + mova [rsp+gprsize+0x00], m2 + mova 
[rsp+gprsize+0x10], m3 + call .h +%if ARCH_X86_32 + mova m4, [esp+gprsize+0x20] + mova m5, [esp+gprsize+0x30] +%endif + mova m10, blendmask + BLENDHWDW m4, m0 ; 1 + BLENDHWDW m5, m1 ; 3 + mova [rsp+gprsize+0x20], m4 + mova [rsp+gprsize+0x30], m5 + call .h +%if ARCH_X86_32 + %if notcpuflag(ssse3) + mova m2, [esp+gprsize+0x00] + %endif + mova m3, [esp+gprsize+0x10] + %define m10 m5 +%endif + psrld m6, m2, 16 + psrld m7, m3, 16 + mova m10, blendmask + BLENDHWDW m6, m0 ; 2 + BLENDHWDW m7, m1 ; 4 + mova [rsp+gprsize+0x40], m6 + mova [rsp+gprsize+0x50], m7 + call .h +%if ARCH_X86_32 + mova m4, [esp+gprsize+0x20] + mova m5, [esp+gprsize+0x30] +%endif + psrld m2, m4, 16 + psrld m3, m5, 16 + mova m10, blendmask + BLENDHWDW m2, m0 ; 3 + BLENDHWDW m3, m1 ; 5 + mova [rsp+gprsize+0x60], m2 + mova [rsp+gprsize+0x70], m3 + call .h +%if ARCH_X86_32 + mova m6, [esp+gprsize+0x40] + mova m7, [esp+gprsize+0x50] + %define m10 m7 +%endif + psrld m4, m6, 16 + psrld m5, m7, 16 + mova m10, blendmask + BLENDHWDW m4, m0 ; 4 + BLENDHWDW m5, m1 ; 6 +%if ARCH_X86_64 + add myd, 512+(64<<10) + mova m6, m2 + mova m7, m3 +%else + mova [esp+gprsize+0x80], m4 + mova [esp+gprsize+0x90], m5 + add dword mym, 512+(64<<10) +%endif + mov counterd, 4 + SAVE_ALPHA_BETA +.main2: + call .h +%if ARCH_X86_32 + mova m6, [esp+gprsize+0x60] + mova m7, [esp+gprsize+0x70] + %define m10 m5 +%endif + psrld m6, 16 + psrld m7, 16 + mova m10, blendmask + BLENDHWDW m6, m0 ; 5 + BLENDHWDW m7, m1 ; 7 +%if ARCH_X86_64 + WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ + m4, m5, \ + [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ + m6, m7 +%else + mova [esp+gprsize+0xA0], m6 + mova [esp+gprsize+0xB0], m7 + LOAD_DELTA_GAMMA_MY + WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ + [esp+gprsize+0x00], [esp+gprsize+0x10], \ + [esp+gprsize+0x80], [esp+gprsize+0x90], \ + [esp+gprsize+0x20], [esp+gprsize+0x30], \ + [esp+gprsize+0xA0], [esp+gprsize+0xB0] + LOAD_ALPHA_BETA_MX +%endif + call .h + mova m2, [rsp+gprsize+0x40] + mova 
m3, [rsp+gprsize+0x50] +%if ARCH_X86_32 + mova m4, [rsp+gprsize+0x80] + mova m5, [rsp+gprsize+0x90] + %define m10 m7 +%endif + mova [rsp+gprsize+0x00], m2 + mova [rsp+gprsize+0x10], m3 + mova [rsp+gprsize+0x40], m4 + mova [rsp+gprsize+0x50], m5 + psrld m4, 16 + psrld m5, 16 + mova m10, blendmask + BLENDHWDW m4, m0 ; 6 + BLENDHWDW m5, m1 ; 8 +%if ARCH_X86_64 + WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ + m6, m7, \ + [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ + m4, m5 +%else + mova [esp+gprsize+0x80], m4 + mova [esp+gprsize+0x90], m5 + LOAD_DELTA_GAMMA_MY + WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ + [esp+gprsize+0x20], [esp+gprsize+0x30], \ + [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ + [esp+gprsize+0x00], [esp+gprsize+0x10], \ + [esp+gprsize+0x80], [esp+gprsize+0x90] + mov mym, myd + mov dstd, dstm + mov dsd, dsm + mov mxd, mxm +%endif + mova m2, [rsp+gprsize+0x60] + mova m3, [rsp+gprsize+0x70] +%if ARCH_X86_32 + mova m6, [esp+gprsize+0xA0] + mova m7, [esp+gprsize+0xB0] +%endif + mova [rsp+gprsize+0x20], m2 + mova [rsp+gprsize+0x30], m3 + mova [rsp+gprsize+0x60], m6 + mova [rsp+gprsize+0x70], m7 + ret +ALIGN function_align +.h: +%if ARCH_X86_32 + %define m8 m3 + %define m9 m4 + %define m10 m5 + %define m14 m6 + %define m15 m7 +%endif + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] +%if ARCH_X86_32 + %assign stack_offset stack_offset+4 + %assign stack_size stack_size+4 + %define PIC_mem [esp+gprsize*2+0x114] + mov PIC_mem, PIC_reg + mov srcd, srcm +%endif + movu m10, [srcq] +%if ARCH_X86_32 + add srcd, ssm + mov srcm, srcd + mov PIC_reg, PIC_mem +%else + add srcq, ssq +%endif + shr mxd, 10 + shr tmp1d, 10 + movq m1, [filterq+mxq *8] ; 0 X + movq m8, [filterq+tmp1q*8] ; 4 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movhps m1, [filterq+tmp2q*8] ; 0 1 + movhps m8, [filterq+tmp1q*8] ; 4 5 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 + shr tmp1d, 10 +%if cpuflag(ssse3) 
+ movq m14, [filterq+mxq *8] ; 2 X + movq m9, [filterq+tmp1q*8] ; 6 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + movhps m14, [filterq+tmp2q*8] ; 2 3 + movhps m9, [filterq+tmp1q*8] ; 6 7 + pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] + pmaddubsw m0, m1 + pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] + pmaddubsw m1, m8 + pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] + pmaddubsw m15, m14 + pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] + pmaddubsw m10, m9 + phaddw m0, m15 + phaddw m1, m10 +%else + %if ARCH_X86_32 + %define m11 m2 + %endif + pcmpeqw m0, m0 + psrlw m14, m0, 8 + psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 + pand m14, m10 ; 00 02 04 06 08 10 12 14 + packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 + psrldq m9, m0, 4 + pshufd m0, m14, q0220 + pand m0, m9 + psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ + pslldq m15, m14, 12 + por m0, m15 ; shufA + psrlw m15, m0, 8 + psraw m11, m1, 8 + psllw m0, 8 + psllw m1, 8 + psrlw m0, 8 + psraw m1, 8 + pmullw m15, m11 + pmullw m0, m1 + paddw m0, m15 ; pmaddubsw m0, m1 + pshufd m15, m14, q0220 + pand m15, m9 + psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ + pslldq m1, m14, 12 + por m15, m1 ; shufC + pshufd m1, m14, q0220 + pand m1, m9 + psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ + pslldq m11, m14, 12 + por m1, m11 ; shufB + pshufd m10, m14, q0220 + pand m10, m9 + psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ + pslldq m14, m14, 12 + por m10, m14 ; shufD + psrlw m9, m1, 8 + psraw m11, m8, 8 + psllw m1, 8 + psllw m8, 8 + psrlw m1, 8 + psraw m8, 8 + pmullw m9, m11 + pmullw m1, m8 + paddw m1, m9 ; pmaddubsw m1, m8 + movq m14, [filterq+mxq *8] ; 2 X + movq m9, [filterq+tmp1q*8] ; 6 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + movhps m14, [filterq+tmp2q*8] ; 2 3 + movhps m9, [filterq+tmp1q*8] ; 6 7 + psrlw m8, m15, 8 + psraw m11, 
m14, 8 + psllw m15, 8 + psllw m14, 8 + psrlw m15, 8 + psraw m14, 8 + pmullw m8, m11 + pmullw m15, m14 + paddw m15, m8 ; pmaddubsw m15, m14 + psrlw m8, m10, 8 + psraw m11, m9, 8 + psllw m10, 8 + psllw m9, 8 + psrlw m10, 8 + psraw m9, 8 + pmullw m8, m11 + pmullw m10, m9 + paddw m10, m8 ; pmaddubsw m10, m9 + pslld m8, m0, 16 + pslld m9, m1, 16 + pslld m14, m15, 16 + pslld m11, m10, 16 + paddw m0, m8 + paddw m1, m9 + paddw m15, m14 + paddw m10, m11 + psrad m0, 16 + psrad m1, 16 + psrad m15, 16 + psrad m10, 16 + packssdw m0, m15 ; phaddw m0, m15 + packssdw m1, m10 ; phaddw m1, m10 +%endif + mova m14, [PIC_sym(pw_8192)] + mova m9, [PIC_sym(pd_32768)] + pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 + pmaddwd m1, m14 + paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword + paddd m1, m9 + ret +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%macro BIDIR_FN 1 ; op (invoked as "%1 offset"; every store below takes its result from m0) + %1 0 + lea stride3q, [strideq*3] + jmp wq +.w4_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*4] +.w4: ; store four 4-byte rows taken from the four dwords of m0 + movd [dstq ], m0 ; row 0 = dw[0] + pshuflw m1, m0, q1032 ; m1 = m0 with dw[1] and dw[0] swapped + movd [dstq+strideq*1], m1 ; row 1 = dw[1] + punpckhqdq m0, m0 ; duplicate the high qword: dw[1,0] = dw[3,2] + movd [dstq+strideq*2], m0 ; row 2 = dw[2] + psrlq m0, 32 ; shift the original dw[3] down into the low dword + movd [dstq+stride3q ], m0 ; row 3 = dw[3] + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*2] +.w8: + movq [dstq ], m0 + movhps [dstq+strideq*1], m0 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq] +.w16: + mova [dstq ], m0 + dec hd + jg .w16_loop + RET +.w32_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq] +.w32: + mova [dstq ], m0 + %1 2 + mova [dstq + 16 ], m0 + dec hd + jg .w32_loop + RET +.w64_loop: + %1_INC_PTR 8 + %1 0 + add dstq, strideq +.w64: + %assign i 0 + %rep 4 + mova [dstq + i*16 ], m0 + %assign i i+1 + %if i < 4 + %1 2*i + %endif + %endrep + dec hd + jg .w64_loop + RET +.w128_loop: + %1_INC_PTR 16 + %1 0
+ add dstq, strideq +.w128: + %assign i 0 + %rep 8 + mova [dstq + i*16 ], m0 + %assign i i+1 + %if i < 8 + %1 2*i + %endif + %endrep + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel + mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 + paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 + mova m1, [tmp1q+(%1+1)*mmsize] + paddw m1, [tmp2q+(%1+1)*mmsize] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 ; pack the words of m0 & m1 to bytes with unsigned saturation +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, avg_ssse3_table + tzcnt wd, wm ; count trailing zeros of w + movifnidn hd, hm ; move h(stack) to h(register) if not already that register + movsxd wq, dword [r6+wq*4] ; load the sign-extended table entry selected by tzcnt(w) + mova m2, [pw_1024+r6-avg_ssse3_table] ; m2 = pw_1024, the pmulhrsw multiplier used by AVG + add wq, r6 ; table entry + table address = target of the jmp wq in BIDIR_FN + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m2, [tmp1q+(%1+0)*mmsize] + mova m0, m2 + psubw m2, [tmp2q+(%1+0)*mmsize] + mova m3, [tmp1q+(%1+1)*mmsize] + mova m1, m3 + psubw m3, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + movd m4, r6m ; weight + movifnidn hd, hm + pxor m0, m0 + movsxd wq, dword [r6+wq*4] + mova m5, [pw_2048+r6-w_avg_ssse3_table] + pshufb m4, m0 ; all-zero shuffle mask: broadcast the low byte of weight to every byte + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 ; weight > 7: keep tmp1/tmp2 and the (weight-16) multiplier as they are + mov r6, tmp1q ; weight <= 7: swap tmp1/tmp2 and negate the multiplier + psubw m0, m4 + mov
tmp1q, tmp2q + mova m4, m0 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 + mova m3, [maskq+(%1+0)*(mmsize/2)] + mova m0, [tmp2q+(%1+0)*mmsize] ; b + psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a + mova m6, m3 ; m + psubb m3, m4, m6 ; -m + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 ; -m << 1 + punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) + pmulhw m1, m2 ; (-m * (b - a)) << 10 + paddw m0, m1 ; + b + mova m1, [tmp2q+(%1+1)*mmsize] ; b + psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a + paddw m2, m2 ; (b - a) << 1 + mova m6, m3 ; (-m << 1) + punpckhbw m3, m4, m6 ; (-m << 9) + pmulhw m2, m3 ; (-m * (b - a)) << 10 + paddw m1, m2 ; + b + pmulhrsw m0, m5 ; round + pmulhrsw m1, m5 ; round + packuswb m0, m1 ; pack 16 -> 8 with unsigned saturation +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*mmsize/2 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +%if ARCH_X86_64 +cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 + movifnidn hd, hm +%else +cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 +%define hd dword r5m +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + movsxd wq, dword [r6+wq*4] + pxor m4, m4 ; m4 = 0: MASK uses it to negate m and to widen bytes to words + mova m5, [base+pw_2048] + add wq, r6 + mov maskq, r6m + BIDIR_FN MASK +%undef hd + +%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out + ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] **** + mova m0, [tmp1q+(%1)] + mova m1, [tmp2q+(%1)] + mova m2, reg_pw_6903 + psubw m1, m0 + pabsw m%2, m1 ; abs(tmp1 - tmp2) + mova m3, m2 + psubusw m2, m%2 + psrlw m2, 8 ; 64 - m + mova m%2, m2 + psllw m2, 10 + pmulhw m1, m2 ; ((tmp2 - tmp1) * (64 - m)) >> 6 + paddw m0, m1 ; tmp1 + () + ;**** do m1 = u16.dst[15..8], m3 = u16.m[15..8] **** + mova m1, [tmp1q+(%1)+mmsize] + mova m2, [tmp2q+(%1)+mmsize] + psubw m2, m1 + pabsw m7, m2 ; abs(tmp1 - tmp2) + psubusw m3, m7
+ psrlw m3, 8 ; 64 - m + phaddw m%2, m3 ; pack both u16.m[8..0]runs as u8.m [15..0] + psllw m3, 10 + pmulhw m2, m3 +%if ARCH_X86_32 + mova reg_pw_2048, [base+pw_2048] +%endif + paddw m1, m2 + pmulhrsw m0, reg_pw_2048 ; round/scale 2048 + pmulhrsw m1, reg_pw_2048 ; round/scale 2048 + packuswb m0, m1 ; concat m0 = u8.dst[15..0] +%endmacro + +%macro W_MASK_420 2 + W_MASK_420_B (%1*16), %2 +%endmacro + +%define base r6-w_mask_420_ssse3_table +%if ARCH_X86_64 +%define reg_pw_6903 m8 +%define reg_pw_2048 m9 +; args: dst, stride, tmp1, tmp2, w, h, mask, sign +cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask + lea r6, [w_mask_420_ssse3_table] + mov wd, wm + tzcnt r7d, wd + movd m0, r7m ; sign + movifnidn hd, hm + movsxd r7, [r6+r7*4] + mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + mova reg_pw_2048, [base+pw_2048] + movd m6, [base+pw_258] ; 64 * 4 + 2 + add r7, r6 + mov maskq, maskmp + psubw m6, m0 + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 + W_MASK_420 0, 4 + jmp r7 + %define loop_w r7d +%else +%define reg_pw_6903 [base+pw_6903] +%define reg_pw_2048 m3 +cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + tzcnt wd, wm + LEA r6, w_mask_420_ssse3_table + movd m0, r7m ; sign + mov maskq, r6mp + mov wd, [r6+wq*4] + movd m6, [base+pw_258] + add wq, r6 + psubw m6, m0 + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 + W_MASK_420 0, 4 + jmp wd + %define loop_w dword r0m + %define hd dword r5m +%endif +.w4_loop: + add tmp1q, 2*16 + add tmp2q, 2*16 + W_MASK_420 0, 4 + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movd [dstq ], m0 ; copy m0[0] + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 ; copy m0[1] + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 ; copy m0[2] + psrlq m0, 32 + movd [dstq+strideq*1], m0 ; copy m0[3] + psubw m1, m6, m4 ; a _ c _ + psrlq m4, 32 ; b _ d _ + psubw m1, m4 + psrlw m1, 2 + packuswb m1, m1 + pshuflw m1, m1, q2020 + movd [maskq], m1 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + 
add tmp1q, 2*16 + add tmp2q, 2*16 + W_MASK_420 0, 4 + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + movq [dstq ], m0 + movhps [dstq+strideq*1], m0 + psubw m0, m6, m4 + punpckhqdq m4, m4 + psubw m0, m4 + psrlw m0, 2 + packuswb m0, m0 + movd [maskq], m0 + sub hd, 2 + jg .w8_loop + RET +.w16: ; w32/64/128 +%if ARCH_X86_32 + mov wd, wm ; because we altered it in 32bit setup +%endif + mov loop_w, wd ; use width as counter + jmp .w16ge_inner_loop_first +.w16ge_loop: + lea tmp1q, [tmp1q+wq*2] ; skip even line pixels + lea tmp2q, [tmp2q+wq*2] ; skip even line pixels + sub dstq, wq + mov loop_w, wd + lea dstq, [dstq+strideq*2] +.w16ge_inner_loop: + W_MASK_420_B 0, 4 +.w16ge_inner_loop_first: + mova [dstq ], m0 + W_MASK_420_B wq*2, 5 ; same columns one row down (byte offset = w coefficients * 2 bytes) + mova [dstq+strideq*1], m0 + psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign + psubw m1, m5 ; - odd line mask + psrlw m1, 2 ; >> 2 + packuswb m1, m1 + movq [maskq], m1 + add tmp1q, 2*16 + add tmp2q, 2*16 + add maskq, 8 + add dstq, 16 + sub loop_w, 16 + jg .w16ge_inner_loop + sub hd, 2 + jg .w16ge_loop + RET + +%undef reg_pw_6903 +%undef reg_pw_2048 +%undef dst_bak +%undef loop_w +%undef orig_w +%undef hd + +%macro BLEND_64M 4; a, b, mask1, mask2 + punpcklbw m0, %1, %2; {b;a}[7..0] + punpckhbw %1, %2 ; {b;a}[15..8] + pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 + pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 + pmulhrsw m0, m5 ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16, assuming m5 = 512 per word (callers load pw_512) + pmulhrsw %1, m5 ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16, same assumption + packuswb m0, %1 ; {blendpx}[15..0] u8 +%endmacro + +%macro BLEND 2; a, b + psubb m3, m4, m0 ; m3 = (64 - m) + punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] + punpckhbw m3, m0 ; {m;(64-m)}[15..8] + BLEND_64M %1, %2, m2, m3 +%endmacro + +cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movifnidn maskq, maskmp + movsxd wq, dword [r6+wq*4] + mova m4,
[base+pb_64] + mova m5, [base+pw_512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movq m0, [maskq]; m + movd m1, [dstq+dsq*0] ; a + movd m6, [dstq+dsq*1] + punpckldq m1, m6 + movq m6, [tmpq] ; b + psubb m3, m4, m0 ; m3 = (64 - m) + punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] + punpcklbw m1, m6 ; {b;a}[7..0] + pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 + pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 + packuswb m1, m0 ; {blendpx}[15..0] u8 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + add maskq, 8 + add tmpq, 8 + lea dstq, [dstq+dsq*2] ; dst_stride * 2 + sub hd, 2 + jg .w4 + RET +.w8: + mova m0, [maskq]; m + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m6, [tmpq] ; b + BLEND m1, m6 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + add maskq, 16 + add tmpq, 16 + lea dstq, [dstq+dsq*2] ; dst_stride * 2 + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [maskq]; m + mova m1, [dstq] ; a + mova m6, [tmpq] ; b + BLEND m1, m6 + mova [dstq], m0 + add maskq, 16 + add tmpq, 16 + add dstq, dsq ; dst_stride + dec hd + jg .w16 + RET +.w32: + %assign i 0 + %rep 2 + mova m0, [maskq+16*i]; m + mova m1, [dstq+16*i] ; a + mova m6, [tmpq+16*i] ; b + BLEND m1, m6 + mova [dstq+i*16], m0 + %assign i i+1 + %endrep + add maskq, 32 + add tmpq, 32 + add dstq, dsq ; dst_stride + dec hd + jg .w32 + RET + +cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + mova m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_ssse3_table + jmp wq +.w2: + movd m3, [maskq+4] + punpckldq m3, m3 + ; 2 mask blend is provided for 4 pixels / 2 lines +.w2_loop: + movd m1, [dstq+dsq*0] ; a {..;a;a} + pinsrw m1, [dstq+dsq*1], 1 + movd m2, [tmpq] ; b + punpcklbw m0, m1, m2; {b;a}[7..0] + pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16 + pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 + packuswb m0, m1 ; 
{blendpx}[8..0] u8 + movd r3d, m0 + mov [dstq+dsq*0], r3w + shr r3d, 16 + mov [dstq+dsq*1], r3w + add tmpq, 2*2 + lea dstq, [dstq + dsq * 2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m3, [maskq+8] + ; 4 mask blend is provided for 8 pixels / 2 lines +.w4_loop: + movd m1, [dstq+dsq*0] ; a + movd m2, [dstq+dsq*1] ; + punpckldq m1, m2 + movq m2, [tmpq] ; b + punpcklbw m1, m2 ; {b;a}[7..0] + pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16 + pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 + packuswb m1, m1 ; {blendpx}[8..0] u8 + movd [dstq], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + add tmpq, 2*4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m3, [maskq+16] + ; 8 mask blend is provided for 16 pixels +.w8_loop: + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m2, [tmpq]; b + BLEND_64M m1, m2, m3, m3 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + add tmpq, 16 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + ; 16 mask blend is provided for 32 pixels + mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) + mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) +.w16_loop: + mova m1, [dstq] ; a + mova m2, [tmpq] ; b + BLEND_64M m1, m2, m3, m4 + mova [dstq], m0 + add tmpq, 16 + add dstq, dsq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + mova [rsp+8], xmm6 +%endif + mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) + mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) + mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) + ; 16 mask blend is provided for 64 pixels +.w32_loop: + mova m1, [dstq+16*0] ; a + mova m2, [tmpq+16*0] ; b + BLEND_64M m1, m2, m3, m4 + movq m1, [dstq+16*1] ; a + punpcklbw m1, [tmpq+16*1] ; b + pmaddubsw m1, m6 + pmulhrsw m1, m5 + packuswb m1, m1 + mova [dstq+16*0], m0 + movq [dstq+16*1], m1 + add tmpq, 32 + add dstq, dsq + dec hd + jg .w32_loop +%if WIN64 + mova xmm6, [rsp+8] +%endif + RET + +cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base t0-blend_h_ssse3_table +%if ARCH_X86_32 + 
; We need to keep the PIC pointer for w4, reload wd from stack instead + DECLARE_REG_TMP 6 +%else + DECLARE_REG_TMP 5 + mov r6d, wd +%endif + LEA t0, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, dword [t0+wq*4] + mova m5, [base+pw_512] + add wq, t0 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + pinsrw m0, [dstq+dsq*1], 1 + movd m2, [maskq+hq*2] + movd m1, [tmpq] + punpcklwd m2, m2 + punpcklbw m0, m1 + pmaddubsw m0, m2 + pmulhrsw m0, m5 + packuswb m0, m0 + movd r3d, m0 + mov [dstq+dsq*0], r3w + shr r3d, 16 + mov [dstq+dsq*1], r3w + lea dstq, [dstq+dsq*2] + add tmpq, 2*2 + add hq, 2 + jl .w2 + RET +.w4: +%if ARCH_X86_32 + mova m3, [base+blend_shuf] +%else + mova m3, [blend_shuf] +%endif +.w4_loop: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + punpckldq m0, m2 ; a + movq m1, [tmpq] ; b + movq m2, [maskq+hq*2] ; m + pshufb m2, m3 + punpcklbw m0, m1 + pmaddubsw m0, m2 + pmulhrsw m0, m5 + packuswb m0, m0 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add tmpq, 4*2 + add hq, 2 + jl .w4_loop + RET +.w8: + movd m4, [maskq+hq*2] + punpcklwd m4, m4 + pshufd m3, m4, q0000 + pshufd m4, m4, q1111 + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m2, [tmpq] + BLEND_64M m1, m2, m3, m4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add tmpq, 8*2 + add hq, 2 + jl .w8 + RET +; w16/w32/w64/w128 +.w16: +%if ARCH_X86_32 + mov r6d, wm +%endif + sub dsq, r6 +.w16_loop0: + movd m3, [maskq+hq*2] + pshuflw m3, m3, q0000 + punpcklqdq m3, m3 + mov wd, r6d +.w16_loop: + mova m1, [dstq] ; a + mova m2, [tmpq] ; b + BLEND_64M m1, m2, m3, m3 + mova [dstq], m0 + add dstq, 16 + add tmpq, 16 + sub wd, 16 + jg .w16_loop + add dstq, dsq + inc hq + jl .w16_loop0 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t 
y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + pxor m1, m1 + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + add reg_src, reg_tmp +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs 
reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix + %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshufb m0, m1 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3], m0 + add r3, mmsize + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3] + %else + mov r1, srcm + movu m0, [r1+r3] + %endif +%if %1 + movu [reg_tmp+r3], m0 +%else + movu [dstq+r3], m0 +%endif + add r3, mmsize + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + add reg_tmp, 
centerwq +%else + lea reg_tmp, [dstq+centerwq] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq-1] + %else + mov r3, srcm + movd m0, [r3+centerwq-1] + %endif + pshufb m0, m1 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3], m0 + add r3, mmsize + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1] + lea r3, [dstq+r1] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test 
topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1] +%else + mov r3, reg_blkm + mova m0, [r3+r1] +%endif + lea r3, [dstq+r1] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp + +cextern resize_filter + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +%if ARCH_X86_64 +cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%elif STACK_ALIGNMENT >= 16 +cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%else +cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%endif + movifnidn dstq, dstmp + movifnidn srcq, srcmp +%if STACK_ALIGNMENT >= 16 + movifnidn dst_wd, dst_wm +%endif +%if ARCH_X86_64 + movifnidn hd, hm +%endif + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + movd m7, dxm + movd m6, mx0m + movd m5, src_wm + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + pshufd m5, m5, q0000 + +%if ARCH_X86_64 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + LEA r7, $$ +%define base r7-$$ +%else + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x +%if STACK_ALIGNMENT >= 16 + LEA r6, $$ +%define base r6-$$ +%else + LEA r4, $$ +%define base r4-$$ +%endif +%endif + +%if ARCH_X86_64 + mova m12, [base+pw_m256] + mova m11, [base+pd_63] + mova m10, [base+pb_8x0_8x8] +%else +%define m12 [base+pw_m256] +%define m11 [base+pd_63] +%define m10 [base+pb_8x0_8x8] +%endif + pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] + pslld m7, 2 ; dx*4 + pslld m5, 14 + paddd m6, m4 ; mx+[0..3]*dx + SCRATCH 7, 15, 0 + SCRATCH 6, 14, 1 + SCRATCH 5, 
13, 2 + + ; m10 = pb_8x0_8x8, m11 = pd_63, m12 = pw_m256 (pmulhrsw constant for x=(x+64)>>7) + ; m13 = (src_w-8)<<14, m14 = mx+[0..3]*dx, m15 = dx*4 + +.loop_y: + xor xd, xd + mova m0, m14 ; per-line working version of mx + +.loop_x: + pxor m1, m1 + pcmpgtd m1, m0 + pandn m1, m0 ; m1 = max(mx, 0) + psrad m2, m0, 8 ; filter offset (unmasked) + pcmpgtd m3, m13, m1 + pand m1, m3 + pandn m3, m13 + por m1, m3 ; m1 = min(m1, m13) + psubd m3, m0, m1 ; pshufb offset + psrad m1, 14 ; clipped src_x offset + psrad m3, 14 ; pshufb edge_emu offset + pand m2, m11 ; filter offset (masked) + + ; load source pixels +%if ARCH_X86_64 + movd r8d, xm1 + pshuflw xm1, xm1, q3232 + movd r9d, xm1 + punpckhqdq xm1, xm1 + movd r10d, xm1 + psrlq xm1, 32 + movd r11d, xm1 + movq xm4, [srcq+r8] + movq xm5, [srcq+r10] + movhps xm4, [srcq+r9] + movhps xm5, [srcq+r11] +%else + movd r3d, xm1 + pshufd xm1, xm1, q3312 + movd r1d, xm1 + pshuflw xm1, xm1, q3232 + movq xm4, [srcq+r3] + movq xm5, [srcq+r1] + movd r3d, xm1 + punpckhqdq xm1, xm1 + movd r1d, xm1 + movhps xm4, [srcq+r3] + movhps xm5, [srcq+r1] +%endif + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + pxor m6, m6 + pcmpeqb m6, m3 +%if ARCH_X86_64 + pmovmskb r8d, m6 + cmp r8d, 0xffff ; are all 16 bytes of the edge_emu offset zero? +%else + pmovmskb r3d, m6 + cmp r3d, 0xffff ; are all 16 bytes of the edge_emu offset zero? +%endif + je .filter + +%if ARCH_X86_64 + movd r8d, xm3 + pshuflw xm3, xm3, q3232 + movd r9d, xm3 + punpckhqdq xm3, xm3 + movd r10d, xm3 + psrlq xm3, 32 + movd r11d, xm3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + movq xm6, [base+resize_shuf+4+r8] + movq xm7, [base+resize_shuf+4+r10] + movhps xm6, [base+resize_shuf+4+r9] + movhps xm7, [base+resize_shuf+4+r11] +%else + movd r3d, xm3 + pshufd xm3, xm3, q3312 + movd r1d, xm3 + pshuflw xm3, xm3, q3232 + movq xm6, [base+resize_shuf+4+r3] + movq xm7, [base+resize_shuf+4+r1] + movd r3d, xm3 + punpckhqdq xm3, xm3 + movd r1d, xm3 + movhps xm6, [base+resize_shuf+4+r3] + movhps xm7, [base+resize_shuf+4+r1] +%endif + + paddb m6, m10 + paddb m7, m10
+ pshufb m4, m6 + pshufb m5, m7 + +.filter: +%if ARCH_X86_64 + movd r8d, xm2 + pshuflw xm2, xm2, q3232 + movd r9d, xm2 + punpckhqdq xm2, xm2 + movd r10d, xm2 + psrlq xm2, 32 + movd r11d, xm2 + movq xm6, [base+resize_filter+r8*8] + movq xm7, [base+resize_filter+r10*8] + movhps xm6, [base+resize_filter+r9*8] + movhps xm7, [base+resize_filter+r11*8] +%else + movd r3d, xm2 + pshufd xm2, xm2, q3312 + movd r1d, xm2 + pshuflw xm2, xm2, q3232 + movq xm6, [base+resize_filter+r3*8] + movq xm7, [base+resize_filter+r1*8] + movd r3d, xm2 + punpckhqdq xm2, xm2 + movd r1d, xm2 + movhps xm6, [base+resize_filter+r3*8] + movhps xm7, [base+resize_filter+r1*8] +%endif + + pmaddubsw m4, m6 + pmaddubsw m5, m7 + phaddw m4, m5 + phaddsw m4, m4 + pmulhrsw m4, m12 ; x=(x+64)>>7 + packuswb m4, m4 + movd [dstq+xq], m4 + + paddd m0, m15 + add xd, 4 +%if STACK_ALIGNMENT >= 16 + cmp xd, dst_wd +%else + cmp xd, dst_wm +%endif + jl .loop_x + +%if ARCH_X86_64 + add dstq, dst_strideq + add srcq, src_strideq + dec hd +%else + add dstq, dst_stridem + add srcq, src_stridem + dec dword r5m +%endif + jg .loop_y + RET + +INIT_XMM ssse3 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse4 +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse2 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm new file mode 100644 index 0000000000..8d59c64a26 --- /dev/null +++ b/third_party/dav1d/src/x86/msac.asm @@ -0,0 +1,669 @@ +; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 ; avoids cacheline splits + +min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 +pw_0xff00: times 8 dw 0xff00 +pw_32: times 8 dw 32 + +%if ARCH_X86_64 +%define resp resq +%define movp movq +%define c_shuf q3333 +%macro DECODE_SYMBOL_ADAPT_INIT 0-1 +%endmacro +%else +%define resp resd +%define movp movd +%define c_shuf q1111 +%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok + mov t0, r0m + mov t1, r1m +%if %1 == 0 + mov t2, r2m +%endif +%if STACK_ALIGNMENT >= 16 + sub esp, 40-%1*4 +%else + mov eax, esp + and esp, ~15 + sub esp, 40-%1*4 + mov [esp], eax +%endif +%endmacro +%endif + +struc msac + .buf: resp 1 + .end: resp 1 + .dif: resp 1 + .rng: resd 1 + .cnt: resd 1 + .update_cdf: resd 1 +endstruc + +%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y) + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8 +%define buf rsp+stack_offset+8 ; shadow space +%elif UNIX64 
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8 +%define buf rsp-40 ; red zone +%else +DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3 +%define buf esp+8 +%endif + +INIT_XMM sse2 +cglobal msac_decode_symbol_adapt4, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + movq m1, [t1] + movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 ; -(n_symbols + 1) + pshuflw m2, m2, q0000 + movd [buf+12], m2 + pand m2, [rax] + mova m0, m1 + psrlw m1, 6 + psllw m1, 7 + pmulhuw m1, m2 + movq m2, [rax+t2*2] + pshuflw m3, m3, c_shuf + paddw m1, m2 + mova [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 ; c >= v + pmovmskb eax, m1 + test t3d, t3d + jz .renorm ; !allow_update_cdf + +; update_cdf: + movzx t3d, word [t1+t4*2] ; count + pcmpeqw m2, m2 + mov t2d, t3d + shr t3d, 4 + cmp t4d, 3 + sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4 + cmp t2d, 32 + adc t2d, 0 ; count + (count < 32) + movd m3, t3d + pavgw m2, m1 ; i >= val ? -1 : 32768 + psubw m2, m0 ; for (i = 0; i < val; i++) + psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; + psraw m2, m3 ; for (; i < n_symbols; i++) + paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; + movq [t1], m0 + mov [t1+t4*2], t2w + +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax+16] ; v + movzx t2d, word [buf+rax+14] ; u + shr eax, 1 +.renorm2: +%if ARCH_X86_64 == 0 +%if STACK_ALIGNMENT >= 16 + add esp, 40 +%else + mov esp, [esp] +%endif +%endif + not t4 + sub t2d, t1d ; rng + shl t1, gprsize*8-16 + add t4, t1 ; ~dif +.renorm3: + mov t1d, [t0+msac.cnt] + movifnidn t7, t0 +.renorm4: + bsr ecx, t2d + xor ecx, 15 ; d + shl t2d, cl + shl t4, cl + mov [t7+msac.rng], t2d + not t4 + sub t1d, ecx + jae .end ; no refill required + +; refill: + mov t2, [t7+msac.buf] + mov rcx, [t7+msac.end] +%if ARCH_X86_64 == 0 + push t5 +%endif + lea t5, [t2+gprsize] + cmp t5, rcx + ja .refill_eob + mov t2, [t2] + lea ecx, [t1+23] + add t1d, 16 + shr ecx, 3 ; shift_bytes + bswap t2 
+ sub t5, rcx + shl ecx, 3 ; shift_bits + shr t2, cl + sub ecx, t1d ; shift_bits - 16 - cnt + mov t1d, gprsize*8-16 + shl t2, cl + mov [t7+msac.buf], t5 + sub t1d, ecx ; cnt + gprsize*8 - shift_bits + xor t4, t2 +%if ARCH_X86_64 == 0 + pop t5 +%endif +.end: + mov [t7+msac.cnt], t1d + mov [t7+msac.dif], t4 + RET +.refill_eob: ; avoid overreading the input buffer + mov t5, rcx + mov ecx, gprsize*8-24 + sub ecx, t1d ; c +.refill_eob_loop: + cmp t2, t5 + jae .refill_eob_end ; eob reached + movzx t1d, byte [t2] + inc t2 + shl t1, cl + xor t4, t1 + sub ecx, 8 + jge .refill_eob_loop +.refill_eob_end: + mov t1d, gprsize*8-24 +%if ARCH_X86_64 == 0 + pop t5 +%endif + sub t1d, ecx + mov [t7+msac.buf], t2 + mov [t7+msac.dif], t4 + mov [t7+msac.cnt], t1d + RET + +cglobal msac_decode_symbol_adapt8, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + mova m1, [t1] + movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 + pshuflw m2, m2, q0000 + movd [buf+12], m2 + punpcklqdq m2, m2 + mova m0, m1 + psrlw m1, 6 + pand m2, [rax] + psllw m1, 7 + pmulhuw m1, m2 + movu m2, [rax+t2*2] + pshuflw m3, m3, c_shuf + paddw m1, m2 + punpcklqdq m3, m3 + mova [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 + pmovmskb eax, m1 + test t3d, t3d + jz m(msac_decode_symbol_adapt4, SUFFIX).renorm + movzx t3d, word [t1+t4*2] + pcmpeqw m2, m2 + mov t2d, t3d + shr t3d, 4 + cmp t4d, 3 ; may be called with n_symbols <= 2 + sbb t3d, -5 + cmp t2d, 32 + adc t2d, 0 + movd m3, t3d + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, m3 + paddw m0, m2 + mova [t1], m0 + mov [t1+t4*2], t2w + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm + +cglobal msac_decode_symbol_adapt16, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m4, [t0+msac.rng] + mova m2, [t1] + mova m3, [t1+16] + movp m5, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 +%if WIN64 + sub rsp, 48 ; need 36 bytes, shadow space is only 32 +%endif + 
pshuflw m4, m4, q0000 + movd [buf-4], m4 + punpcklqdq m4, m4 + mova m0, m2 + psrlw m2, 6 + mova m1, m3 + psrlw m3, 6 + pand m4, [rax] + psllw m2, 7 + psllw m3, 7 + pmulhuw m2, m4 + pmulhuw m3, m4 + movu m4, [rax+t2*2] + pshuflw m5, m5, c_shuf + paddw m2, m4 + psubw m4, [rax-pw_0xff00+pw_32] + punpcklqdq m5, m5 + paddw m3, m4 + mova [buf], m2 + psubusw m2, m5 + mova [buf+16], m3 + psubusw m3, m5 + pxor m4, m4 + pcmpeqw m2, m4 + pcmpeqw m3, m4 + packsswb m5, m2, m3 + pmovmskb eax, m5 + test t3d, t3d + jz .renorm + movzx t3d, word [t1+t4*2] + pcmpeqw m4, m4 + mova m5, m4 + lea t2d, [t3+80] ; only support n_symbols > 2 + shr t2d, 4 + cmp t3d, 32 + adc t3d, 0 + pavgw m4, m2 + pavgw m5, m3 + psubw m4, m0 + psubw m0, m2 + movd m2, t2d + psubw m5, m1 + psubw m1, m3 + psraw m4, m2 + psraw m5, m2 + paddw m0, m4 + paddw m1, m5 + mova [t1], m0 + mova [t1+16], m1 + mov [t1+t4*2], t3w +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax*2] + movzx t2d, word [buf+rax*2-2] +%if WIN64 + add rsp, 48 +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2 + +cglobal msac_decode_bool_adapt, 0, 6, 0 + movifnidn t1, r1mp + movifnidn t0, r0mp + movzx eax, word [t1] + movzx t3d, byte [t0+msac.rng+1] + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] +%if ARCH_X86_64 + mov t5d, eax +%endif + and eax, ~63 + imul eax, t3d +%if UNIX64 + mov t6, t4 +%endif + shr eax, 7 + add eax, 4 ; v + mov t3d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t3d ; r - v + sub t4, rax ; dif - vw + setb al + cmovb t2d, t3d + mov t3d, [t0+msac.update_cdf] +%if UNIX64 + cmovb t4, t6 +%else + cmovb t4, [t0+msac.dif] +%endif +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + not t4 + test t3d, t3d + jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 +%if UNIX64 == 0 + push t6 +%endif + movzx t6d, word [t1+2] +%if ARCH_X86_64 == 0 + push t5 + movzx t5d, word [t1] +%endif + movifnidn t7, t0 + lea ecx, [t6+64] + cmp t6d, 32 + adc t6d, 0 + mov [t1+2], t6w + imul t6d, eax, -32769 + shr ecx, 4 ; rate + 
add t6d, t5d ; if (bit) + sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1; + sar t6d, cl ; else + sub t5d, t6d ; cdf[0] -= cdf[0] >> rate; + mov [t1], t5w +%if WIN64 + mov t1d, [t7+msac.cnt] + pop t6 + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4 +%else +%if ARCH_X86_64 == 0 + pop t5 + pop t6 +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 +%endif + +cglobal msac_decode_bool_equi, 0, 6, 0 + movifnidn t0, r0mp + mov t1d, [t0+msac.rng] + mov t4, [t0+msac.dif] + mov t2d, t1d + mov t1b, 8 + mov t3, t4 + mov eax, t1d + shr t1d, 1 ; v + shl rax, gprsize*8-17 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + cmovb t4, t3 + setb al ; the upper 32 bits contains garbage but that's OK + not t4 +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 + +cglobal msac_decode_bool, 0, 6, 0 + movifnidn t0, r0mp + movifnidn t1d, r1m + movzx eax, byte [t0+msac.rng+1] ; r >> 8 + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] + and t1d, ~63 + imul eax, t1d + mov t3, t4 + shr eax, 7 + add eax, 4 ; v + mov t1d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + cmovb t4, t3 + setb al + not t4 +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 + +%macro HI_TOK 1 ; update_cdf +%if ARCH_X86_64 == 0 + mov eax, -24 +%endif +%%loop: +%if %1 + movzx t2d, word [t1+3*2] +%endif + mova m1, m0 + pshuflw m2, m2, q0000 + psrlw m1, 6 + movd [buf+12], m2 + pand m2, m4 + psllw m1, 7 + pmulhuw m1, m2 +%if ARCH_X86_64 == 0 + add eax, 5 + mov [buf+8], eax +%endif + pshuflw m3, m3, c_shuf + paddw m1, m5 + movq [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 + pmovmskb eax, m1 +%if %1 + lea ecx, [t2+80] + pcmpeqw m2, m2 + shr ecx, 4 + cmp t2d, 32 + adc t2d, 0 + movd m3, ecx + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, m3 + paddw m0, m2 + movq [t1], m0 + mov [t1+3*2], t2w +%endif + tzcnt eax, eax + movzx 
ecx, word [buf+rax+16] + movzx t2d, word [buf+rax+14] + not t4 +%if ARCH_X86_64 + add t6d, 5 +%endif + sub eax, 5 ; setup for merging the tok_br and tok branches + sub t2d, ecx + shl rcx, gprsize*8-16 + add t4, rcx + bsr ecx, t2d + xor ecx, 15 + shl t2d, cl + shl t4, cl + movd m2, t2d + mov [t7+msac.rng], t2d + not t4 + sub t5d, ecx + jae %%end + mov t2, [t7+msac.buf] + mov rcx, [t7+msac.end] +%if UNIX64 == 0 + push t8 +%endif + lea t8, [t2+gprsize] + cmp t8, rcx + ja %%refill_eob + mov t2, [t2] + lea ecx, [t5+23] + add t5d, 16 + shr ecx, 3 + bswap t2 + sub t8, rcx + shl ecx, 3 + shr t2, cl + sub ecx, t5d + mov t5d, gprsize*8-16 + shl t2, cl + mov [t7+msac.buf], t8 +%if UNIX64 == 0 + pop t8 +%endif + sub t5d, ecx + xor t4, t2 +%%end: + movp m3, t4 +%if ARCH_X86_64 + add t6d, eax ; CF = tok_br < 3 || tok == 15 + jnc %%loop + lea eax, [t6+30] +%else + add eax, [buf+8] + jnc %%loop + add eax, 30 +%if STACK_ALIGNMENT >= 16 + add esp, 36 +%else + mov esp, [esp] +%endif +%endif + mov [t7+msac.dif], t4 + shr eax, 1 + mov [t7+msac.cnt], t5d + RET +%%refill_eob: + mov t8, rcx + mov ecx, gprsize*8-24 + sub ecx, t5d +%%refill_eob_loop: + cmp t2, t8 + jae %%refill_eob_end + movzx t5d, byte [t2] + inc t2 + shl t5, cl + xor t4, t5 + sub ecx, 8 + jge %%refill_eob_loop +%%refill_eob_end: +%if UNIX64 == 0 + pop t8 +%endif + mov t5d, gprsize*8-24 + mov [t7+msac.buf], t2 + sub t5d, ecx + jmp %%end +%endmacro + +cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 + DECODE_SYMBOL_ADAPT_INIT 1 +%if ARCH_X86_64 == 0 && PIC + LEA t2, min_prob+12*2 + %define base t2-(min_prob+12*2) +%else + %define base 0 +%endif + movq m0, [t1] + movd m2, [t0+msac.rng] + mov eax, [t0+msac.update_cdf] + movq m4, [base+pw_0xff00] + movp m3, [t0+msac.dif] + movq m5, [base+min_prob+12*2] + mov t4, [t0+msac.dif] + mov t5d, [t0+msac.cnt] +%if ARCH_X86_64 + mov t6d, -24 +%endif + movifnidn t7, t0 + test eax, eax + jz .no_update_cdf + HI_TOK 1 +.no_update_cdf: + HI_TOK 0 + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal 
msac_decode_symbol_adapt16, 3, 6, 6 + lea rax, [pw_0xff00] + vpbroadcastw m2, [t0+msac.rng] + mova m0, [t1] + vpbroadcastw m3, [t0+msac.dif+6] + vbroadcasti128 m4, [rax] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 +%if STACK_ALIGNMENT < 32 + mov r5, rsp +%if WIN64 + and rsp, ~31 + sub rsp, 40 +%else + and r5, ~31 + %define buf r5-32 +%endif +%elif WIN64 + sub rsp, 64 +%else + %define buf rsp-56 +%endif + psrlw m1, m0, 6 + movd [buf-4], xm2 + pand m2, m4 + psllw m1, 7 + pmulhuw m1, m2 + paddw m1, [rax+t2*2] + mova [buf], m1 + pmaxuw m1, m3 + pcmpeqw m1, m3 + pmovmskb eax, m1 + test t3d, t3d + jz .renorm + movzx t3d, word [t1+t4*2] + pcmpeqw m2, m2 + lea t2d, [t3+80] + shr t2d, 4 + cmp t3d, 32 + adc t3d, 0 + movd xm3, t2d + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, xm3 + paddw m0, m2 + mova [t1], m0 + mov [t1+t4*2], t3w +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax-0] + movzx t2d, word [buf+rax-2] + shr eax, 1 +%if WIN64 +%if STACK_ALIGNMENT < 32 + mov rsp, r5 +%else + add rsp, 64 +%endif +%endif + vzeroupper + jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 +%endif diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h new file mode 100644 index 0000000000..e11cd08c8a --- /dev/null +++ b/third_party/dav1d/src/x86/msac.h @@ -0,0 +1,64 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_X86_MSAC_H +#define DAV1D_SRC_X86_MSAC_H + /* Prototypes for the x86 msac decoders; presumably implemented by the `cglobal msac_decode_*` routines in the assembly earlier in this patch (symbol name mangling is done by x86inc, not shown here). */ +unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); +unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); +unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf); + +/* Needed for checkasm */ +unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); + /* Bind the generic decoder names directly to the SSE2 versions when the build is x86-64 or the compiler is already targeting SSE2 (__SSE2__ defined, or _M_IX86_FP >= 2). */ +#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2 +#endif + /* The three bool decoders are bound outside the SSE2 compile-time check above, i.e. for every build that includes this header. */ +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2 +#define dav1d_msac_decode_bool_equi 
dav1d_msac_decode_bool_equi_sse2 +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2 + /* adapt16: on x86-64 the call goes through the per-context symbol_adapt16 function pointer; 32-bit builds targeting SSE2 bind the SSE2 version directly; otherwise no macro is defined here. */ +#if ARCH_X86_64 +#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb)) +#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 +#endif + /* Declared unconditionally; the definition in msac_init.c is wrapped in #if ARCH_X86_64. */ +void dav1d_msac_init_x86(MsacContext *const s); + +#endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/third_party/dav1d/src/x86/msac_init.c b/third_party/dav1d/src/x86/msac_init.c new file mode 100644 index 0000000000..a634da27c4 --- /dev/null +++ b/third_party/dav1d/src/x86/msac_init.c @@ -0,0 +1,43 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/msac.h" +#include "src/x86/msac.h" + /* Runtime dispatch for s->symbol_adapt16 based on the flags returned by dav1d_get_cpu_flags(). Only compiled when ARCH_X86_64 is non-zero. If neither flag tested below is set, s->symbol_adapt16 is left untouched. */ +#if ARCH_X86_64 +void dav1d_msac_init_x86(MsacContext *const s) { + const unsigned flags = dav1d_get_cpu_flags(); + /* First choice: the SSE2 implementation. */ + if (flags & DAV1D_X86_CPU_FLAG_SSE2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + } + /* Tested second so that AVX2 overrides the SSE2 assignment when both flags are set. */ + if (flags & DAV1D_X86_CPU_FLAG_AVX2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +} +#endif -- cgit v1.2.3