From a90a5cba08fdf6c0ceb95101c275108a152a3aed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 12 Jun 2024 07:35:37 +0200 Subject: Merging upstream version 127.0. Signed-off-by: Daniel Baumann --- third_party/dav1d/meson.build | 2 + third_party/dav1d/meson_options.txt | 5 + third_party/dav1d/src/arm/64/mc.S | 4 +- third_party/dav1d/src/arm/64/mc_dotprod.S | 1413 +++++++++++ third_party/dav1d/src/arm/64/msac.S | 21 +- third_party/dav1d/src/arm/itx.h | 63 - third_party/dav1d/src/arm/mc.h | 85 +- third_party/dav1d/src/cdf.c | 1378 ++++++----- third_party/dav1d/src/cdf.h | 48 +- third_party/dav1d/src/decode.c | 95 +- third_party/dav1d/src/internal.h | 9 +- third_party/dav1d/src/itx.h | 63 + third_party/dav1d/src/lf_mask.c | 6 +- third_party/dav1d/src/meson.build | 1 + third_party/dav1d/src/refmvs.c | 4 +- third_party/dav1d/src/riscv/itx.h | 63 - third_party/dav1d/src/x86/ipred_avx2.asm | 3 +- third_party/dav1d/src/x86/itx.h | 64 - third_party/dav1d/src/x86/mc16_avx2.asm | 1602 +++++++++--- third_party/dav1d/src/x86/mc_avx2.asm | 1475 +++++++++--- third_party/dav1d/src/x86/mc_avx512.asm | 3739 ++++++++++++++++++----------- third_party/dav1d/tests/meson.build | 2 +- 22 files changed, 7113 insertions(+), 3032 deletions(-) create mode 100644 third_party/dav1d/src/arm/64/mc_dotprod.S (limited to 'third_party/dav1d') diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index e371415d53..a2637ed797 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -81,6 +81,8 @@ cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or # Logging option cdata.set10('CONFIG_LOG', get_option('logging')) +cdata.set10('CONFIG_MACOS_KPERF', get_option('macos_kperf')) + # # OS/Compiler checks and defines # diff --git a/third_party/dav1d/meson_options.txt b/third_party/dav1d/meson_options.txt index c04deffd73..b0b45b474d 100644 --- a/third_party/dav1d/meson_options.txt +++ b/third_party/dav1d/meson_options.txt @@ -68,3 +68,8 @@ 
option('trim_dsp', choices: ['true', 'false', 'if-release'], value: 'if-release', description: 'Eliminate redundant DSP functions where possible') + +option('macos_kperf', + type: 'boolean', + value: false, + description: 'Use the private macOS kperf API for benchmarking') diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 3df0393c3a..5b493be82d 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -837,7 +837,7 @@ endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). -function put_neon +function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw @@ -939,7 +939,7 @@ endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. -function prep_neon +function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw diff --git a/third_party/dav1d/src/arm/64/mc_dotprod.S b/third_party/dav1d/src/arm/64/mc_dotprod.S new file mode 100644 index 0000000000..fcf04ee4d0 --- /dev/null +++ b/third_party/dav1d/src/arm/64/mc_dotprod.S @@ -0,0 +1,1413 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Janne Grunau + * Copyright © 2024, Martin Storsjo + * Copyright © 2024, Arm Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + + +#if HAVE_DOTPROD +ENABLE_DOTPROD + +// No spaces in these expressions, due to gas-preprocessor. It is translated by +// -1 to save the negative offset at getting the address of `mc_subpel_filters`. +#define REGULAR1 (((0*15-1)<<7)|(3*15-1)) +#define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) +#define SHARP1 (((2*15-1)<<7)|(3*15-1)) + +#define FUNC_ALIGN 2 +#define JUMP_ALIGN 2 +#define LOOP_ALIGN 2 + + +// Lookup table used to help conversion of shifted 32-bit values to 8-bit. + .align 4 +L(hv_tbl_neon_dotprod): + .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + +// Shuffle indices to permute horizontal samples in preparation for input to +// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the +// interval of [-3, 4] relative to the current sample position. We load samples +// from index value -4 to keep loads word aligned, so the shuffle bytes are +// translated by 1 to handle this. 
+ .align 4 +L(h_tbl_neon_dotprod): + .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 + .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 + .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 + .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + +// Vertical convolutions are also using SDOT instructions, where a 128-bit +// register contains a transposed 4x4 matrix of values. Subsequent iterations of +// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop +// iteration. These shuffle indices shift and merge this 4x4 matrix with the +// values of a new line. + .align 4 +L(v_tbl_neon_dotprod): + .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 + .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 + .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 + .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 + .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 + + +// make_8tap_fn: emit one exported entry point named <op>_8tap_<type>_8bpc_<isa>. +// The entry copies the horizontal filter-type constant (type_h) into x9 and the +// vertical one (type_v) into x10, then, unless jump=0, branches to the shared +// body <op>_8tap_<isa>. With jump=0 no branch is emitted; presumably that entry +// is meant to run straight into the code assembled right after it - confirm +// against the function/endfunc helpers in asm.S. +.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 +function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN + mov x9, \type_h + mov x10, \type_v + .if \jump + b \op\()_8tap_\isa + .endif +endfunc +.endm + +.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa +make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa +make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa +make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa +make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa +make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa +make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa +make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa +make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 + +function \type\()_8tap_\isa, align=FUNC_ALIGN + clz w8, \w + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + sub w8, w8, #24 // for jump tables + 
movrel x12, X(mc_subpel_filters) + cbnz \mx, L(\type\()_8tap_h_hv_\isa) + cbnz \my, L(\type\()_8tap_v_\isa) +.ifc \type, prep + add \wd_strd, \w, \w // prep_neon needs w * 2 as stride +.endif + b X(\type\()_neon) + + .align JUMP_ALIGN +L(\type\()_8tap_v_\isa): + madd \my, \my, w11, w10 +.ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding +.endif + sub \src, \src, \s_strd + ldr q6, L(v_tbl_neon_dotprod) +.ifc \type, prep + dup v4.4s, w8 +.endif + ubfx w11, \my, #7, #7 + and \my, \my, #0x7F + ldr q28, L(v_tbl_neon_dotprod) + 16 + cmp \h, #4 + csel \my, \my, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 + ldr q29, L(v_tbl_neon_dotprod) + 32 + add \xmy, x12, \xmy, lsl #3 // subpel V filter address + movi v5.16b, #128 + ldr d7, [\xmy] + cmp \w, #8 + b.eq 80f + b.lt 40f + + // .align JUMP_ALIGN // fallthrough +160: // V - 16xN+ + ldr q30, L(v_tbl_neon_dotprod) + 48 + ldr q31, L(v_tbl_neon_dotprod) + 64 +.ifc \type, prep + add \wd_strd, \w, \w +.endif + .align LOOP_ALIGN +161: + mov \lsrc, \src + mov \ldst, \dst + sub w8, \h, #1 + + ldr q16, [\lsrc] + ldr q17, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q18, [\lsrc] + ldr q19, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + ldr q20, [\lsrc] + ldr q21, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q22, [\lsrc] + ldr q23, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v18.16b, v20.16b, v21.16b + zip2 v21.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + zip2 v27.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v22.8h, v1.8h, v3.8h + zip2 v25.8h, v1.8h, v3.8h + + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + zip1 v23.8h, v21.8h, v27.8h + zip2 v26.8h, v21.8h, v27.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v22.16b, v22.16b, 
v5.16b + sub v25.16b, v25.16b, v5.16b + + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b + sub v23.16b, v23.16b, v5.16b + sub v26.16b, v26.16b, v5.16b + + .align LOOP_ALIGN +16: + ldr q27, [\lsrc] + add \lsrc, \lsrc, \s_strd +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v27.16b, v5.16b + sub v21.16b, v27.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + tbl v16.16b, {v16.16b, v17.16b}, v6.16b + tbl v19.16b, {v19.16b, v20.16b}, v6.16b + tbl v22.16b, {v22.16b, v23.16b}, v6.16b + tbl v25.16b, {v25.16b, v26.16b}, v6.16b + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + tbl v17.16b, {v17.16b, v18.16b}, v28.16b + tbl v20.16b, {v20.16b, v21.16b}, v29.16b + tbl v23.16b, {v23.16b, v24.16b}, v30.16b + tbl v26.16b, {v26.16b, v27.16b}, v31.16b + + subs w8, w8, #1 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + st1 {v0.8h, v1.8h}, [\ldst], \d_strd +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + st1 {v0.16b}, [\ldst], \d_strd +.endif + b.gt 16b + +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, 
v26.16b, v7.4b[1] + + subs \w, \w, #16 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\ldst] + add \dst, \dst, #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + str q0, [\ldst] + add \dst, \dst, #16 +.endif + add \src, \src, #16 + b.gt 161b + ret + + .align JUMP_ALIGN +80: // V - 8xN + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr d20, [\src] + ldr d21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d22, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip1 v18.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b +.ifc \type, put + b.eq 82f +.endif + + .align LOOP_ALIGN +8: + ldr d21, [\src] + ldr d27, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + tbl v16.16b, {v22.16b, v23.16b}, v6.16b + tbl 
v19.16b, {v25.16b, v26.16b}, v6.16b + tbl v17.16b, {v23.16b, v24.16b}, v28.16b + tbl v20.16b, {v26.16b, v27.16b}, v29.16b + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \h, \h, #2 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst], #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + +.ifc \type, put + .align JUMP_ALIGN +82: + ldr d21, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.else + ldr d21, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst] +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] +.endif + ret + + .align JUMP_ALIGN +40: // V - 4xN or 2xN (put only) +.ifc \type, put + cmp \w, #2 + b.eq 20f +.endif + ldr s16, [\src] + ldr s17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s18, [\src] + ldr s19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr s20, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s22, 
[\src] + ldr s23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v17.16b, v17.16b, v5.16b +.ifc \type, put + b.eq 42f +.endif + + .align LOOP_ALIGN +4: + ldr s18, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.endif + sub v18.16b, v18.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + subs \h, \h, #2 + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst], #16 +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + +.ifc \type, put + .align JUMP_ALIGN +42: + ldr s18, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.else + ldr s18, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.endif + sub v18.16b, v18.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst] + ret +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + 
str w9, [\dst, \d_strd] + ret + + .align JUMP_ALIGN +20: // V - 2xN + ldr h16, [\src] + ldr h17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h18, [\src] + ldr h19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr h20, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h22, [\src] + ldr h23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.4h, v0.4h, v2.4h + zip1 v17.4h, v18.4h, v24.4h + + sub v16.8b, v16.8b, v5.8b + sub v17.8b, v17.8b, v5.8b + + b.eq 22f + + .align LOOP_ALIGN +2: + ldr h18, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + sub v21.8b, v21.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + + .align JUMP_ALIGN +22: + ldr h18, [\src] + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + ret +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_hv_\isa): + madd \mx, \mx, w11, w9 
+ madd w14, \my, w11, w10 // for HV + ldr q28, L(h_tbl_neon_dotprod) + mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + sub \src, \src, #4 // src - 4 + dup v27.4s, w13 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7F + ubfx w11, w14, #7, #7 // for HV + and w14, w14, #0x7F // for HV + cmp \w, #4 + csel \mx, \mx, w9, le + add \xmx, x12, \xmx, lsl #3 // subpel H filter address + movi v24.16b, #128 + cbz \my, L(\type\()_8tap_h_\isa) + + // HV cases + cmp \h, #4 + csel w14, w14, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + add \xmy, x12, x14, lsl #3 // subpel V filter address + mov x15, x30 + ldr d7, [\xmy] +.ifc \type, put + ldr q25, L(hv_tbl_neon_dotprod) +.endif + sxtl v7.8h, v7.8b + cmp w10, SHARP1 + b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 + + // HV 8-tap cases + sub \src, \src, \s_strd // src - src_stride * 3 - 4 + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV8 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + + .align LOOP_ALIGN +8: + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smull2 v1.4s, v16.8h, v7.h[0] + mov v16.16b, v17.16b + + sub v23.16b, v23.16b, v24.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + smlal v0.4s, v17.4h, v7.h[1] + smlal2 v1.4s, v17.8h, v7.h[1] + mov v17.16b, v18.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + tbl v4.16b, {v23.16b}, 
v30.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal2 v1.4s, v18.8h, v7.h[2] + mov v18.16b, v19.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + + smlal v0.4s, v19.4h, v7.h[3] + smlal2 v1.4s, v19.8h, v7.h[3] + mov v19.16b, v20.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + + smlal v0.4s, v20.4h, v7.h[4] + smlal2 v1.4s, v20.8h, v7.h[4] + mov v20.16b, v21.16b + + smlal v0.4s, v21.4h, v7.h[5] + smlal2 v1.4s, v21.8h, v7.h[5] +.ifc \type, prep + uzp1 v23.8h, v5.8h, v6.8h +.endif + mov v21.16b, v22.16b + + smlal v0.4s, v22.4h, v7.h[6] + smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \type, prep + sshr v22.8h, v23.8h, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + subs w8, w8, #1 + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else + shrn v22.4h, v5.4s, #2 + shrn2 v22.8h, v6.4s, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs w8, w8, #1 + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +40: // HV8 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, 
v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.ifc \type, put + subs \h, \h, #1 +.endif + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV8 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b + + subs \h, \h, #1 + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_6tap_hv_\isa): + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV6 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, 
[\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + // Per 8-pixel column: reset the row cursor (lsrc) and the output cursor + // (ldst), then prime v16-v20 with five horizontally filtered rows. + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +8: + // Fetch the next source row through the lsrc row cursor that the + // hv_filter8 calls above already advanced, exactly as the HV8 loop does. + // The xmy operand holds the vertical filter address and must not be used + // as a row pointer; it only worked when bound to the same register as lsrc. + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + sub v23.16b, v23.16b, v24.16b + mov v16.16b, v17.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + mov v19.16b, v20.16b + uzp1 v23.8h, v5.8h, v6.8h + + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + sshr v20.8h, v23.8h, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + st1 {v0.8h}, [\ldst], \d_strd + subs w8, w8, #1 + b.gt 8b + add \dst, \dst, #16 +.else + subs w8, w8, #1 + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + // Horizontal pass for one 8-pixel row: loads 16 bytes at lsrc, steps lsrc by + // s_strd and accumulates into v22/v23, both seeded from v27. + .align FUNC_ALIGN +L(\type\()_hv_filter8_\isa): + ldr q4, [\lsrc] + add \lsrc, \lsrc, \s_strd + sub v4.16b, v4.16b, v24.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + tbl v2.16b, {v4.16b}, v28.16b + tbl v3.16b, {v4.16b}, 
v29.16b + tbl v4.16b, {v4.16b}, v30.16b + sdot v22.4s, v2.16b, v26.4b[0] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v23.4s, v4.16b, v26.4b[1] + shrn v22.4h, v22.4s, #2 + shrn2 v22.8h, v23.4s, #2 + ret + + .align FUNC_ALIGN +L(\type\()_hv_filter4_\isa): + mov v22.16b, v27.16b + ld1 {v4.8b}, [\src], \s_strd + sub v4.16b, v4.16b, v24.16b + tbl v2.16b, {v4.16b}, v28.16b + sdot v22.4s, v2.16b, v26.4b[0] + shrn v22.4h, v22.4s, #2 + ret + + .align JUMP_ALIGN +40: // HV6 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV6 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +2: + ld1 
{v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 + + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_\isa): + adr x9, L(\type\()_8tap_h_\isa\()_tbl) + ldrh w8, [x9, x8, lsl #1] +.ifc \type, put + mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT + dup v27.4s, w10 +.endif + sub x9, x9, x8 + br x9 + +.ifc \type, put + .align JUMP_ALIGN +20: // H - 2xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s6, [\xmx, #2] + + .align LOOP_ALIGN +2: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v6.4b[0] + sdot v5.4s, v3.16b, v6.4b[0] + + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + ret + +.endif + + .align JUMP_ALIGN +40: // H - 4xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s26, [\xmx, #2] + + .align LOOP_ALIGN +4: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v26.4b[0] + sdot v5.4s, v3.16b, v26.4b[0] +.ifc \type, prep + subs 
\h, \h, #2 + shrn v4.4h, v4.4s, #2 + shrn2 v4.8h, v5.4s, #2 + str q4, [\dst], #16 +.else + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret + + .align JUMP_ALIGN +80: // H - 8xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] + + .align LOOP_ALIGN +8: + ldr q0, [\src] + ldr q16, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.16b, v0.16b, v24.16b + sub v16.16b, v16.16b, v24.16b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + mov v20.16b, v27.16b + mov v21.16b, v27.16b + + tbl v1.16b, {v0.16b}, v28.16b + tbl v2.16b, {v0.16b}, v29.16b + tbl v3.16b, {v0.16b}, v30.16b + tbl v17.16b, {v16.16b}, v28.16b + tbl v18.16b, {v16.16b}, v29.16b + tbl v19.16b, {v16.16b}, v30.16b + + sdot v4.4s, v1.16b, v26.4b[0] + sdot v5.4s, v2.16b, v26.4b[0] + sdot v20.4s, v17.16b, v26.4b[0] + sdot v21.4s, v18.16b, v26.4b[0] + sdot v4.4s, v2.16b, v26.4b[1] + sdot v5.4s, v3.16b, v26.4b[1] + sdot v20.4s, v18.16b, v26.4b[1] + sdot v21.4s, v19.16b, v26.4b[1] + + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v20.8h, v20.8h, v21.8h +.ifc \type, prep + sshr v4.8h, v4.8h, #2 + sshr v20.8h, v20.8h, #2 + subs \h, \h, #2 + stp q4, q20, [\dst], #32 +.else + sqshrun v4.8b, v4.8h, #6 + sqshrun v20.8b, v20.8h, #6 + subs \h, \h, #2 + str d4, [\dst] + str d20, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + ret + + .align JUMP_ALIGN +160: // H - 16xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] + + .align LOOP_ALIGN +16: + ldp q16, q17, [\src] + add \src, \src, \s_strd + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, 
v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs \h, \h, #1 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs \h, \h, #1 + str q6, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 16b + ret + + .align JUMP_ALIGN +320: // H - 32xN+ +640: +1280: + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] +.ifc \type, put + sub \d_strd, \d_strd, \w, uxtw +.endif + sub \s_strd, \s_strd, \w, uxtw + mov w8, \w + + .align LOOP_ALIGN +32: + ldp q16, q17, [\src], #16 + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs w8, w8, #16 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs w8, w8, #16 + str q6, [\dst], #16 +.endif + b.gt 32b 
+ + add \src, \src, \s_strd +.ifc \type, put + add \dst, \dst, \d_strd +.endif + mov w8, \w + subs \h, \h, #1 + b.gt 32b + ret + +L(\type\()_8tap_h_\isa\()_tbl): + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) +.ifc \type, put + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) +.endif +endfunc +.endm + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_DOTPROD +#endif // HAVE_DOTPROD diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 7bef9243fb..9033072a82 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -288,10 +288,8 @@ function msac_decode_hi_tok_neon, export=1 mvni v30.4h, #0x3f // 0xffc0 ldrh w9, [x1, #6] // count = cdf[n_symbols] ld1r {v3.4h}, [x16] // rng - movrel x16, bits ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) add x17, x0, #DIF + 6 - ld1 {v16.8h}, [x16] mov w13, #-24 and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 ldr w10, [x0, #ALLOW_UPDATE_CDF] @@ -305,30 +303,27 @@ function msac_decode_hi_tok_neon, export=1 add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) str h3, [sp, #14] // store original u = s->rng - cmhs v2.8h, v1.8h, v4.8h // c >= v + cmhs v2.4h, v1.4h, v4.4h // c >= v str q4, [sp, #16] // store v values to allow 
indexed access - and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask - addv h6, v6.8h // Aggregate mask bits - umov w3, v6.h[0] + addv h6, v2.4h // -4 + ret add w13, w13, #5 - rbit w3, w3 + smov w15, v6.h[0] add x8, sp, #16 - clz w15, w3 // ret + add w15, w15, #4 // ret cbz w10, 2f // update_cdf - movi v5.8b, #0xff + sub v5.4h, v0.4h, v2.4h // cdf[i] + (i >= val ? 1 : 0) mov w4, #-5 - urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 + orr v2.4h, #0x80, lsl #8 // i >= val ? -1 : 32768 sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) - sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + sub v4.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.4h, w4 // -rate sub w9, w9, w9, lsr #5 // count - (count == 32) - sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0) sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate add w9, w9, #1 // count + (count < 32) - add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + add v0.4h, v5.4h, v4.4h // cdf[i] + (32768 - cdf[i]) >> rate st1 {v0.4h}, [x1] and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 strh w9, [x1, #6] diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h index 17234e027a..2a58a31322 100644 --- a/third_party/dav1d/src/arm/itx.h +++ b/third_party/dav1d/src/arm/itx.h @@ -28,34 +28,6 @@ #include "src/cpu.h" #include "src/itx.h" -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ 
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - decl_itx17_fns( 4, 4, neon); decl_itx16_fns( 4, 8, neon); decl_itx16_fns( 4, 16, neon); @@ -78,41 +50,6 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, 
ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h index 06cd533a9b..7e57fd37cb 100644 --- a/third_party/dav1d/src/arm/mc.h +++ b/third_party/dav1d/src/arm/mc.h @@ -30,26 +30,40 @@ #include "src/mc.h" #include "src/cpu.h" -decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); -decl_mc_fn(BF(dav1d_put_bilin, neon)); +#define decl_8tap_gen(decl_name, fn_name, opt) \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular, 
opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp, opt)) + +#define decl_8tap_fns(opt) \ + decl_8tap_gen(mc, put, opt); \ + decl_8tap_gen(mct, prep, opt) + +#define init_8tap_gen(name, opt) \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, opt) + +#define init_8tap_fns(opt) \ + init_8tap_gen(mc, opt); \ + init_8tap_gen(mct, opt) + +decl_8tap_fns(neon); +decl_8tap_fns(neon_dotprod); -decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); decl_avg_fn(BF(dav1d_avg, neon)); @@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); 
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); - - init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + init_8tap_fns(neon); + + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); c->avg = BF(dav1d_avg, neon); c->w_avg = BF(dav1d_w_avg, neon); @@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); + +#if ARCH_AARCH64 +#if HAVE_DOTPROD && BITDEPTH == 8 + if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; + + init_8tap_fns(neon_dotprod); +#endif // HAVE_DOTPROD && BITDEPTH == 8 +#endif // ARCH_AARCH64 } diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c index e0f2132e00..d9721dad46 100644 --- a/third_party/dav1d/src/cdf.c +++ b/third_party/dav1d/src/cdf.c @@ -65,631 +65,638 @@ #define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o) 
-static const CdfModeContext av1_default_cdf = { - .y_mode = { - { CDF12(22801, 23489, 24293, 24756, 25601, 26123, - 26606, 27418, 27945, 29228, 29685, 30349) }, - { CDF12(18673, 19845, 22631, 23318, 23950, 24649, - 25527, 27364, 28152, 29701, 29984, 30852) }, - { CDF12(19770, 20979, 23396, 23939, 24241, 24654, - 25136, 27073, 27830, 29360, 29730, 30659) }, - { CDF12(20155, 21301, 22838, 23178, 23261, 23533, - 23703, 24804, 25352, 26575, 27016, 28049) }, - }, .use_filter_intra = { - [BS_4x4] = { CDF1( 4621) }, - [BS_4x8] = { CDF1( 6743) }, - [BS_8x4] = { CDF1( 5893) }, - [BS_8x8] = { CDF1( 7866) }, - [BS_8x16] = { CDF1(12551) }, - [BS_16x8] = { CDF1( 9394) }, - [BS_16x16] = { CDF1(12408) }, - [BS_16x32] = { CDF1(14301) }, - [BS_32x16] = { CDF1(12756) }, - [BS_32x32] = { CDF1(22343) }, - [BS_32x64] = { CDF1(16384) }, - [BS_64x32] = { CDF1(16384) }, - [BS_64x64] = { CDF1(16384) }, - [BS_64x128] = { CDF1(16384) }, - [BS_128x64] = { CDF1(16384) }, - [BS_128x128] = { CDF1(16384) }, - [BS_4x16] = { CDF1(12770) }, - [BS_16x4] = { CDF1(10368) }, - [BS_8x32] = { CDF1(20229) }, - [BS_32x8] = { CDF1(18101) }, - [BS_16x64] = { CDF1(16384) }, - [BS_64x16] = { CDF1(16384) }, - }, .filter_intra = { - CDF4(8949, 12776, 17211, 29558), - }, .uv_mode = { - { - { CDF12(22631, 24152, 25378, 25661, 25986, 26520, - 27055, 27923, 28244, 30059, 30941, 31961) }, - { CDF12( 9513, 26881, 26973, 27046, 27118, 27664, - 27739, 27824, 28359, 29505, 29800, 31796) }, - { CDF12( 9845, 9915, 28663, 28704, 28757, 28780, - 29198, 29822, 29854, 30764, 31777, 32029) }, - { CDF12(13639, 13897, 14171, 25331, 25606, 25727, - 25953, 27148, 28577, 30612, 31355, 32493) }, - { CDF12( 9764, 9835, 9930, 9954, 25386, 27053, - 27958, 28148, 28243, 31101, 31744, 32363) }, - { CDF12(11825, 13589, 13677, 13720, 15048, 29213, - 29301, 29458, 29711, 31161, 31441, 32550) }, - { CDF12(14175, 14399, 16608, 16821, 17718, 17775, - 28551, 30200, 30245, 31837, 32342, 32667) }, - { CDF12(12885, 13038, 14978, 15590, 15673, 
15748, - 16176, 29128, 29267, 30643, 31961, 32461) }, - { CDF12(12026, 13661, 13874, 15305, 15490, 15726, - 15995, 16273, 28443, 30388, 30767, 32416) }, - { CDF12(19052, 19840, 20579, 20916, 21150, 21467, - 21885, 22719, 23174, 28861, 30379, 32175) }, - { CDF12(18627, 19649, 20974, 21219, 21492, 21816, - 22199, 23119, 23527, 27053, 31397, 32148) }, - { CDF12(17026, 19004, 19997, 20339, 20586, 21103, - 21349, 21907, 22482, 25896, 26541, 31819) }, - { CDF12(12124, 13759, 14959, 14992, 15007, 15051, - 15078, 15166, 15255, 15753, 16039, 16606) }, - }, { - { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899, - 15656, 15986, 20086, 20995, 22455, 24212) }, - { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199, - 21451, 22099, 24228, 24693, 27032, 29472) }, - { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949, - 21695, 21774, 23138, 24256, 24703, 26679) }, - { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034, - 16741, 18371, 21520, 22206, 23389, 24182) }, - { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857, - 22253, 22411, 24911, 25380, 26027, 26376) }, - { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402, - 21753, 21981, 24780, 25386, 26517, 27176) }, - { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169, - 20682, 20803, 23188, 23763, 24455, 24940) }, - { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735, - 18827, 19059, 22336, 23204, 23964, 24793) }, - { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753, - 10417, 18898, 22494, 23139, 24764, 25989) }, - { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040, - 15004, 15534, 20714, 21789, 23443, 24861) }, - { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245, - 15235, 15902, 20102, 22696, 23774, 25838) }, - { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125, - 15163, 15636, 19676, 20474, 23519, 25208) }, - { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801, - 8064, 8232, 9248, 9875, 10521, 29048) }, - }, - }, .angle_delta = { - { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) }, - { CDF6( 2301, 5608, 
8801, 23487, 26974, 30330) }, - { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) }, - { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) }, - { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) }, - { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) }, - { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) }, - { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) }, - }, .filter = { - { - { CDF2(31935, 32720) }, { CDF2( 5568, 32719) }, - { CDF2( 422, 2938) }, { CDF2(28244, 32608) }, - { CDF2(31206, 31953) }, { CDF2( 4862, 32121) }, - { CDF2( 770, 1152) }, { CDF2(20889, 25637) }, - }, { - { CDF2(31910, 32724) }, { CDF2( 4120, 32712) }, - { CDF2( 305, 2247) }, { CDF2(27403, 32636) }, - { CDF2(31022, 32009) }, { CDF2( 2963, 32093) }, - { CDF2( 601, 943) }, { CDF2(14969, 21398) }, - }, - }, .newmv_mode = { - { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) }, - { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) }, - }, .globalmv_mode = { - { CDF1( 2175) }, { CDF1( 1054) }, - }, .refmv_mode = { - { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) }, - { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) }, - }, .drl_bit = { - { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) }, - }, .comp_inter_mode = { - { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) }, - { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, - { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, - { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, - { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, - { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, - { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, - { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) }, - }, .intra = { - { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) }, - { CDF1(26538) }, - }, .comp = { - { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) }, - { CDF1(10640) }, { CDF1( 2901) }, - }, .comp_dir = { - { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) }, - { CDF1( 7499) }, { CDF1(22475) }, - }, .jnt_comp = { - { 
CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) }, - { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) }, - }, .mask_comp = { - { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) }, - { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) }, - }, .wedge_comp = { - { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) }, - { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) }, - { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) }, - }, .wedge_idx = { - { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, - 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, - { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, - 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, - { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, - 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, - { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, - 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, - { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, - 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, - { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, - 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, - { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, - 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, - { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, - 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, - { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, - 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, - }, .interintra = { - { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) }, - { CDF1(30237) }, - }, .interintra_mode = { - { CDF3(8192, 16384, 24576) }, - { CDF3(1875, 11082, 27332) }, - { CDF3(2473, 9996, 26388) }, - { CDF3(4238, 11537, 25926) }, - }, .interintra_wedge = { - { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) }, - { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) }, - { CDF1(26872) }, - }, .ref = { - { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } }, - { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } }, - { { CDF1( 
4236) }, { CDF1(19647) }, { CDF1(31194) } }, - { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } }, - { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } }, - { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } }, - }, .comp_fwd_ref = { - { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } }, - { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } }, - { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } }, - }, .comp_bwd_ref = { - { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } }, - { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } }, - }, .comp_uni_ref = { - { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } }, - { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } }, - { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } }, - }, .txsz = { - { - { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) }, - }, { - { CDF2(12272, 30172) }, { CDF2(12272, 30172) }, - { CDF2(18677, 30848) }, - }, { - { CDF2(12986, 15180) }, { CDF2(12986, 15180) }, - { CDF2(24302, 25602) }, - }, { - { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) }, - { CDF2(16803, 22759) }, - }, - }, .txpart = { - { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } }, - { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } }, - { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } }, - { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } }, - { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } }, - { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } }, - { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } }, - }, .txtp_inter1 = { - { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, - 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, - { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, - 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, - }, .txtp_inter2 = { - CDF11( 770, 2421, 5225, 12907, 15819, 18927, - 21561, 24089, 26595, 28526, 30529) - }, .txtp_inter3 = { - { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) }, - }, .txtp_intra1 = { - { - { CDF6( 1535, 8035, 9461, 12751, 
23467, 27825) }, - { CDF6( 564, 3335, 9709, 10870, 18143, 28094) }, - { CDF6( 672, 3247, 3676, 11982, 19415, 23127) }, - { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) }, - { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) }, - { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) }, - { CDF6( 439, 2838, 3522, 6737, 18058, 23754) }, - { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) }, - { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) }, - { CDF6( 202, 3734, 4747, 7298, 17127, 24016) }, - { CDF6( 447, 4312, 6819, 8884, 16010, 23858) }, - { CDF6( 277, 4369, 5255, 8905, 16465, 22271) }, - { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) }, - }, { - { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) }, - { CDF6( 326, 8796, 14632, 15079, 19272, 27486) }, - { CDF6( 484, 7576, 7712, 14443, 19159, 22591) }, - { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) }, - { CDF6( 655, 4854, 5249, 5913, 22099, 27138) }, - { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) }, - { CDF6( 311, 5295, 5552, 6885, 16107, 22672) }, - { CDF6( 883, 8059, 8270, 11258, 17289, 21549) }, - { CDF6( 741, 7580, 9318, 10345, 16688, 29046) }, - { CDF6( 110, 7406, 7915, 9195, 16041, 23329) }, - { CDF6( 363, 7974, 9357, 10673, 15629, 24474) }, - { CDF6( 153, 7647, 8112, 9936, 15307, 19996) }, - { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) }, - }, - }, .txtp_intra2 = { - { - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - }, { - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 
6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - }, { - { CDF4( 1127, 12814, 22772, 27483) }, - { CDF4( 145, 6761, 11980, 26667) }, - { CDF4( 362, 5887, 11678, 16725) }, - { CDF4( 385, 15213, 18587, 30693) }, - { CDF4( 25, 2914, 23134, 27903) }, - { CDF4( 60, 4470, 11749, 23991) }, - { CDF4( 37, 3332, 14511, 21448) }, - { CDF4( 157, 6320, 13036, 17439) }, - { CDF4( 119, 6719, 12906, 29396) }, - { CDF4( 47, 5537, 12576, 21499) }, - { CDF4( 269, 6076, 11258, 23115) }, - { CDF4( 83, 5615, 12001, 17228) }, - { CDF4( 1968, 5556, 12023, 18547) }, - }, - }, .skip = { - { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) }, - }, .skip_mode = { - { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) }, - }, .partition = { - { - // 128x128 -> 64x64 - { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, - { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) }, - { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) }, - { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) }, - }, { - // 64x64 -> 32x32 - { CDF9(20137, 21547, 23078, 29566, 29837, - 30261, 30524, 30892, 31724) }, - { CDF9( 6732, 7490, 9497, 27944, 28250, - 28515, 28969, 29630, 30104) }, - { CDF9( 5945, 7663, 8348, 28683, 29117, - 29749, 30064, 30298, 32238) }, - { CDF9( 870, 1212, 1487, 31198, 31394, - 31574, 31743, 31881, 32332) }, - }, { - // 32x32 -> 16x16 - { CDF9(18462, 20920, 23124, 27647, 28227, - 29049, 29519, 30178, 31544) }, - { CDF9( 7689, 9060, 12056, 24992, 25660, - 26182, 26951, 28041, 29052) }, - { CDF9( 6015, 9009, 10062, 24544, 25409, - 26545, 27071, 27526, 32047) }, - { CDF9( 1394, 2208, 2796, 28614, 29061, - 29466, 29840, 30185, 31899) }, - }, { - // 16x16 -> 
8x8 - { CDF9(15597, 20929, 24571, 26706, 27664, - 28821, 29601, 30571, 31902) }, - { CDF9( 7925, 11043, 16785, 22470, 23971, - 25043, 26651, 28701, 29834) }, - { CDF9( 5414, 13269, 15111, 20488, 22360, - 24500, 25537, 26336, 32117) }, - { CDF9( 2662, 6362, 8614, 20860, 23053, - 24778, 26436, 27829, 31171) }, - }, { - // 8x8 -> 4x4 only supports the four legacy partition types - { CDF3(19132, 25510, 30392) }, - { CDF3(13928, 19855, 28540) }, - { CDF3(12522, 23679, 28629) }, - { CDF3( 9896, 18783, 25853) }, - }, - }, .seg_pred = { - { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, - }, .seg_id = { - { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, - { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, - { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, - }, .cfl_sign = { - CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294) - }, .cfl_alpha = { - { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, - 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, - { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, - 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, - { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, - 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, - { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, - 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, - { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, - 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, - { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, - 32144, 32413, 32520, 32594, 32622, 32656, 32660) }, - }, .restore_wiener = { - CDF1(11570) - }, .restore_sgrproj = { - CDF1(16855) - }, .restore_switchable = { - CDF2( 9413, 22581) - }, .delta_q = { - CDF3(28160, 32120, 32677) - }, .delta_lf = { - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - }, .motion_mode = { - [BS_8x8] = { 
CDF2( 7651, 24760) }, - [BS_8x16] = { CDF2( 4738, 24765) }, - [BS_8x32] = { CDF2(28799, 31390) }, - [BS_16x8] = { CDF2( 5391, 25528) }, - [BS_16x16] = { CDF2(19419, 26810) }, - [BS_16x32] = { CDF2( 5123, 23606) }, - [BS_16x64] = { CDF2(28973, 31594) }, - [BS_32x8] = { CDF2(26431, 30774) }, - [BS_32x16] = { CDF2(11606, 24308) }, - [BS_32x32] = { CDF2(26260, 29116) }, - [BS_32x64] = { CDF2(20360, 28062) }, - [BS_64x16] = { CDF2(29742, 31203) }, - [BS_64x32] = { CDF2(21679, 26830) }, - [BS_64x64] = { CDF2(29516, 30701) }, - [BS_64x128] = { CDF2(28898, 30397) }, - [BS_128x64] = { CDF2(30878, 31335) }, - [BS_128x128] = { CDF2(32507, 32558) }, - }, .obmc = { - [BS_8x8] = { CDF1(10437) }, - [BS_8x16] = { CDF1( 9371) }, - [BS_8x32] = { CDF1(23664) }, - [BS_16x8] = { CDF1( 9301) }, - [BS_16x16] = { CDF1(17432) }, - [BS_16x32] = { CDF1(14423) }, - [BS_16x64] = { CDF1(24008) }, - [BS_32x8] = { CDF1(20901) }, - [BS_32x16] = { CDF1(15142) }, - [BS_32x32] = { CDF1(25817) }, - [BS_32x64] = { CDF1(22823) }, - [BS_64x16] = { CDF1(26879) }, - [BS_64x32] = { CDF1(22083) }, - [BS_64x64] = { CDF1(30128) }, - [BS_64x128] = { CDF1(31014) }, - [BS_128x64] = { CDF1(31560) }, - [BS_128x128] = { CDF1(32638) }, - }, .pal_y = { - { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } }, - { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } }, - { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } }, - { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } }, - { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } }, - { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } }, - { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } }, - }, .pal_sz = { - { - { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) }, - { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) }, - { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) }, - { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) }, - { CDF6(12725, 19180, 21863, 24839, 27535, 30120) }, - { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) }, - { CDF6(14940, 20797, 21678, 24186, 
27033, 28999) }, - }, { - { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) }, - { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) }, - { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) }, - { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) }, - { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) }, - { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) }, - { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) }, - }, - }, .pal_uv = { - { CDF1(32461) }, { CDF1(21488) }, - }, .color_map = { - { /* y */ +typedef struct CdfDefaultContext { + CdfModeContext m; + struct { + CdfMvComponent comp; + ALIGN(uint16_t joint[N_MV_JOINTS], 8); + } mv; + ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32); +} CdfDefaultContext; + +static const CdfDefaultContext default_cdf = { + .m = { + .y_mode = { + { CDF12(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { CDF12(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { CDF12(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { CDF12(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) }, + }, .use_filter_intra = { + [BS_4x4] = { CDF1( 4621) }, + [BS_4x8] = { CDF1( 6743) }, + [BS_8x4] = { CDF1( 5893) }, + [BS_8x8] = { CDF1( 7866) }, + [BS_8x16] = { CDF1(12551) }, + [BS_16x8] = { CDF1( 9394) }, + [BS_16x16] = { CDF1(12408) }, + [BS_16x32] = { CDF1(14301) }, + [BS_32x16] = { CDF1(12756) }, + [BS_32x32] = { CDF1(22343) }, + [BS_32x64] = { CDF1(16384) }, + [BS_64x32] = { CDF1(16384) }, + [BS_64x64] = { CDF1(16384) }, + [BS_64x128] = { CDF1(16384) }, + [BS_128x64] = { CDF1(16384) }, + [BS_128x128] = { CDF1(16384) }, + [BS_4x16] = { CDF1(12770) }, + [BS_16x4] = { CDF1(10368) }, + [BS_8x32] = { CDF1(20229) }, + [BS_32x8] = { CDF1(18101) }, + [BS_16x64] = { CDF1(16384) }, + [BS_64x16] = { CDF1(16384) }, + }, .filter_intra = { + CDF4(8949, 12776, 17211, 29558), + }, .uv_mode = { { - { CDF1(28710) }, { 
CDF1(16384) }, { CDF1(10553) }, - { CDF1(27036) }, { CDF1(31603) }, + { CDF12(22631, 24152, 25378, 25661, 25986, 26520, + 27055, 27923, 28244, 30059, 30941, 31961) }, + { CDF12( 9513, 26881, 26973, 27046, 27118, 27664, + 27739, 27824, 28359, 29505, 29800, 31796) }, + { CDF12( 9845, 9915, 28663, 28704, 28757, 28780, + 29198, 29822, 29854, 30764, 31777, 32029) }, + { CDF12(13639, 13897, 14171, 25331, 25606, 25727, + 25953, 27148, 28577, 30612, 31355, 32493) }, + { CDF12( 9764, 9835, 9930, 9954, 25386, 27053, + 27958, 28148, 28243, 31101, 31744, 32363) }, + { CDF12(11825, 13589, 13677, 13720, 15048, 29213, + 29301, 29458, 29711, 31161, 31441, 32550) }, + { CDF12(14175, 14399, 16608, 16821, 17718, 17775, + 28551, 30200, 30245, 31837, 32342, 32667) }, + { CDF12(12885, 13038, 14978, 15590, 15673, 15748, + 16176, 29128, 29267, 30643, 31961, 32461) }, + { CDF12(12026, 13661, 13874, 15305, 15490, 15726, + 15995, 16273, 28443, 30388, 30767, 32416) }, + { CDF12(19052, 19840, 20579, 20916, 21150, 21467, + 21885, 22719, 23174, 28861, 30379, 32175) }, + { CDF12(18627, 19649, 20974, 21219, 21492, 21816, + 22199, 23119, 23527, 27053, 31397, 32148) }, + { CDF12(17026, 19004, 19997, 20339, 20586, 21103, + 21349, 21907, 22482, 25896, 26541, 31819) }, + { CDF12(12124, 13759, 14959, 14992, 15007, 15051, + 15078, 15166, 15255, 15753, 16039, 16606) }, }, { - { CDF2(27877, 30490) }, { CDF2(11532, 25697) }, - { CDF2( 6544, 30234) }, { CDF2(23018, 28072) }, - { CDF2(31915, 32385) }, + { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899, + 15656, 15986, 20086, 20995, 22455, 24212) }, + { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199, + 21451, 22099, 24228, 24693, 27032, 29472) }, + { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949, + 21695, 21774, 23138, 24256, 24703, 26679) }, + { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034, + 16741, 18371, 21520, 22206, 23389, 24182) }, + { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857, + 22253, 22411, 24911, 25380, 26027, 
26376) }, + { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402, + 21753, 21981, 24780, 25386, 26517, 27176) }, + { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169, + 20682, 20803, 23188, 23763, 24455, 24940) }, + { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735, + 18827, 19059, 22336, 23204, 23964, 24793) }, + { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753, + 10417, 18898, 22494, 23139, 24764, 25989) }, + { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040, + 15004, 15534, 20714, 21789, 23443, 24861) }, + { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245, + 15235, 15902, 20102, 22696, 23774, 25838) }, + { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125, + 15163, 15636, 19676, 20474, 23519, 25208) }, + { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801, + 8064, 8232, 9248, 9875, 10521, 29048) }, + }, + }, .angle_delta = { + { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) }, + { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) }, + { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) }, + { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) }, + { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) }, + { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) }, + { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) }, + { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) }, + }, .filter = { + { + { CDF2(31935, 32720) }, { CDF2( 5568, 32719) }, + { CDF2( 422, 2938) }, { CDF2(28244, 32608) }, + { CDF2(31206, 31953) }, { CDF2( 4862, 32121) }, + { CDF2( 770, 1152) }, { CDF2(20889, 25637) }, }, { - { CDF3(25572, 28046, 30045) }, - { CDF3( 9478, 21590, 27256) }, - { CDF3( 7248, 26837, 29824) }, - { CDF3(19167, 24486, 28349) }, - { CDF3(31400, 31825, 32250) }, + { CDF2(31910, 32724) }, { CDF2( 4120, 32712) }, + { CDF2( 305, 2247) }, { CDF2(27403, 32636) }, + { CDF2(31022, 32009) }, { CDF2( 2963, 32093) }, + { CDF2( 601, 943) }, { CDF2(14969, 21398) }, + }, + }, .newmv_mode = { + { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) }, + { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) }, + 
}, .globalmv_mode = { + { CDF1( 2175) }, { CDF1( 1054) }, + }, .refmv_mode = { + { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) }, + { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) }, + }, .drl_bit = { + { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) }, + }, .comp_inter_mode = { + { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) }, + }, .intra = { + { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) }, + { CDF1(26538) }, + }, .comp = { + { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) }, + { CDF1(10640) }, { CDF1( 2901) }, + }, .comp_dir = { + { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) }, + { CDF1( 7499) }, { CDF1(22475) }, + }, .jnt_comp = { + { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) }, + { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) }, + }, .mask_comp = { + { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) }, + { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) }, + }, .wedge_comp = { + { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) }, + { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) }, + { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) }, + }, .wedge_idx = { + { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, + 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, + { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, + 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, + { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { CDF15( 1142, 
3491, 6277, 7314, 8089, 8355, 9023, 13624, + 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, + { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, + 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, + { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, + 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, + }, .interintra = { + { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) }, + { CDF1(30237) }, + }, .interintra_mode = { + { CDF3(8192, 16384, 24576) }, + { CDF3(1875, 11082, 27332) }, + { CDF3(2473, 9996, 26388) }, + { CDF3(4238, 11537, 25926) }, + }, .interintra_wedge = { + { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) }, + { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) }, + { CDF1(26872) }, + }, .ref = { + { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } }, + { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } }, + { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } }, + { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } }, + { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } }, + { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } }, + }, .comp_fwd_ref = { + { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } }, + { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } }, + { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } }, + }, .comp_bwd_ref = { + { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } }, + { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } }, + }, .comp_uni_ref = { + { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } }, + { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } }, + { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } }, + }, .txsz = { + { + { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) }, }, { - { CDF4(24779, 26955, 28576, 30282) }, - { CDF4( 8669, 20364, 24073, 28093) }, - { CDF4( 
4255, 27565, 29377, 31067) }, - { CDF4(19864, 23674, 26716, 29530) }, - { CDF4(31646, 31893, 32147, 32426) }, + { CDF2(12272, 30172) }, { CDF2(12272, 30172) }, + { CDF2(18677, 30848) }, }, { - { CDF5(23132, 25407, 26970, 28435, 30073) }, - { CDF5( 7443, 17242, 20717, 24762, 27982) }, - { CDF5( 6300, 24862, 26944, 28784, 30671) }, - { CDF5(18916, 22895, 25267, 27435, 29652) }, - { CDF5(31270, 31550, 31808, 32059, 32353) }, + { CDF2(12986, 15180) }, { CDF2(12986, 15180) }, + { CDF2(24302, 25602) }, }, { - { CDF6(23105, 25199, 26464, 27684, 28931, 30318) }, - { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) }, - { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) }, - { CDF6(18544, 22373, 24457, 26195, 28119, 30045) }, - { CDF6(31198, 31451, 31670, 31882, 32123, 32391) }, + { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) }, + { CDF2(16803, 22759) }, + }, + }, .txpart = { + { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } }, + { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } }, + { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } }, + { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } }, + { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } }, + { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } }, + { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } }, + }, .txtp_inter1 = { + { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, + 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, + { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, + 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, + }, .txtp_inter2 = { + CDF11( 770, 2421, 5225, 12907, 15819, 18927, + 21561, 24089, 26595, 28526, 30529) + }, .txtp_inter3 = { + { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) }, + }, .txtp_intra1 = { + { + { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) }, + { CDF6( 564, 3335, 9709, 10870, 18143, 28094) }, + { CDF6( 672, 3247, 3676, 11982, 19415, 23127) }, + { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) }, + { CDF6( 4423, 6074, 7985, 10416, 
25693, 29298) }, + { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) }, + { CDF6( 439, 2838, 3522, 6737, 18058, 23754) }, + { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) }, + { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) }, + { CDF6( 202, 3734, 4747, 7298, 17127, 24016) }, + { CDF6( 447, 4312, 6819, 8884, 16010, 23858) }, + { CDF6( 277, 4369, 5255, 8905, 16465, 22271) }, + { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) }, }, { - { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, - { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) }, - { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) }, - { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, - { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) }, + { CDF6( 326, 8796, 14632, 15079, 19272, 27486) }, + { CDF6( 484, 7576, 7712, 14443, 19159, 22591) }, + { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) }, + { CDF6( 655, 4854, 5249, 5913, 22099, 27138) }, + { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) }, + { CDF6( 311, 5295, 5552, 6885, 16107, 22672) }, + { CDF6( 883, 8059, 8270, 11258, 17289, 21549) }, + { CDF6( 741, 7580, 9318, 10345, 16688, 29046) }, + { CDF6( 110, 7406, 7915, 9195, 16041, 23329) }, + { CDF6( 363, 7974, 9357, 10673, 15629, 24474) }, + { CDF6( 153, 7647, 8112, 9936, 15307, 19996) }, + { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) }, }, - }, { /* uv */ + }, .txtp_intra2 = { { - { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) }, - { CDF1(29257) }, { CDF1(31610) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 
19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + }, { + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, }, { - { CDF2(25257, 29145) }, { CDF2(12287, 27293) }, - { CDF2( 7033, 27960) }, { CDF2(20145, 25405) }, - { CDF2(30608, 31639) }, + { CDF4( 1127, 12814, 22772, 27483) }, + { CDF4( 145, 6761, 11980, 26667) }, + { CDF4( 362, 5887, 11678, 16725) }, + { CDF4( 385, 15213, 18587, 30693) }, + { CDF4( 25, 2914, 23134, 27903) }, + { CDF4( 60, 4470, 11749, 23991) }, + { CDF4( 37, 3332, 14511, 21448) }, + { CDF4( 157, 6320, 13036, 17439) }, + { CDF4( 119, 6719, 12906, 29396) }, + { CDF4( 47, 5537, 12576, 21499) }, + { CDF4( 269, 6076, 11258, 23115) }, + { CDF4( 83, 5615, 12001, 17228) }, + { CDF4( 1968, 5556, 12023, 18547) }, + }, + }, .skip = { + { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) }, + }, .skip_mode = { + { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) }, + }, .partition = { + { + // 128x128 -> 64x64 + { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) }, }, { - { CDF3(24210, 27175, 29903) }, - { CDF3( 9888, 22386, 27214) }, - { CDF3( 5901, 26053, 29293) }, - { CDF3(18318, 22152, 28333) }, - { CDF3(30459, 31136, 31926) }, + // 64x64 -> 32x32 + { CDF9(20137, 21547, 23078, 29566, 29837, + 30261, 30524, 30892, 31724) }, + { CDF9( 6732, 7490, 9497, 27944, 28250, + 28515, 28969, 29630, 30104) }, + { CDF9( 5945, 7663, 
8348, 28683, 29117, + 29749, 30064, 30298, 32238) }, + { CDF9( 870, 1212, 1487, 31198, 31394, + 31574, 31743, 31881, 32332) }, }, { - { CDF4(22980, 25479, 27781, 29986) }, - { CDF4( 8413, 21408, 24859, 28874) }, - { CDF4( 2257, 29449, 30594, 31598) }, - { CDF4(19189, 21202, 25915, 28620) }, - { CDF4(31844, 32044, 32281, 32518) }, + // 32x32 -> 16x16 + { CDF9(18462, 20920, 23124, 27647, 28227, + 29049, 29519, 30178, 31544) }, + { CDF9( 7689, 9060, 12056, 24992, 25660, + 26182, 26951, 28041, 29052) }, + { CDF9( 6015, 9009, 10062, 24544, 25409, + 26545, 27071, 27526, 32047) }, + { CDF9( 1394, 2208, 2796, 28614, 29061, + 29466, 29840, 30185, 31899) }, }, { - { CDF5(22217, 24567, 26637, 28683, 30548) }, - { CDF5( 7307, 16406, 19636, 24632, 28424) }, - { CDF5( 4441, 25064, 26879, 28942, 30919) }, - { CDF5(17210, 20528, 23319, 26750, 29582) }, - { CDF5(30674, 30953, 31396, 31735, 32207) }, + // 16x16 -> 8x8 + { CDF9(15597, 20929, 24571, 26706, 27664, + 28821, 29601, 30571, 31902) }, + { CDF9( 7925, 11043, 16785, 22470, 23971, + 25043, 26651, 28701, 29834) }, + { CDF9( 5414, 13269, 15111, 20488, 22360, + 24500, 25537, 26336, 32117) }, + { CDF9( 2662, 6362, 8614, 20860, 23053, + 24778, 26436, 27829, 31171) }, }, { - { CDF6(21239, 23168, 25044, 26962, 28705, 30506) }, - { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) }, - { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) }, - { CDF6(15889, 18323, 21704, 24698, 26976, 29690) }, - { CDF6(30988, 31204, 31479, 31734, 31983, 32325) }, + // 8x8 -> 4x4 only supports the four legacy partition types + { CDF3(19132, 25510, 30392) }, + { CDF3(13928, 19855, 28540) }, + { CDF3(12522, 23679, 28629) }, + { CDF3( 9896, 18783, 25853) }, + }, + }, .seg_pred = { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + }, .seg_id = { + { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, + { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, + { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, + }, .cfl_sign = { + CDF7( 1418, 
2123, 13340, 18405, 26972, 28343, 32294) + }, .cfl_alpha = { + { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, + 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, + { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, + 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, + { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, + 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, + { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, + 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, + { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, + 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, + { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, + 32144, 32413, 32520, 32594, 32622, 32656, 32660) }, + }, .restore_wiener = { + CDF1(11570) + }, .restore_sgrproj = { + CDF1(16855) + }, .restore_switchable = { + CDF2( 9413, 22581) + }, .delta_q = { + CDF3(28160, 32120, 32677) + }, .delta_lf = { + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + }, .motion_mode = { + [BS_8x8] = { CDF2( 7651, 24760) }, + [BS_8x16] = { CDF2( 4738, 24765) }, + [BS_8x32] = { CDF2(28799, 31390) }, + [BS_16x8] = { CDF2( 5391, 25528) }, + [BS_16x16] = { CDF2(19419, 26810) }, + [BS_16x32] = { CDF2( 5123, 23606) }, + [BS_16x64] = { CDF2(28973, 31594) }, + [BS_32x8] = { CDF2(26431, 30774) }, + [BS_32x16] = { CDF2(11606, 24308) }, + [BS_32x32] = { CDF2(26260, 29116) }, + [BS_32x64] = { CDF2(20360, 28062) }, + [BS_64x16] = { CDF2(29742, 31203) }, + [BS_64x32] = { CDF2(21679, 26830) }, + [BS_64x64] = { CDF2(29516, 30701) }, + [BS_64x128] = { CDF2(28898, 30397) }, + [BS_128x64] = { CDF2(30878, 31335) }, + [BS_128x128] = { CDF2(32507, 32558) }, + }, .obmc = { + [BS_8x8] = { CDF1(10437) }, + [BS_8x16] = { CDF1( 9371) }, + [BS_8x32] = { CDF1(23664) }, + [BS_16x8] = { CDF1( 9301) }, + [BS_16x16] = { CDF1(17432) }, + 
[BS_16x32] = { CDF1(14423) }, + [BS_16x64] = { CDF1(24008) }, + [BS_32x8] = { CDF1(20901) }, + [BS_32x16] = { CDF1(15142) }, + [BS_32x32] = { CDF1(25817) }, + [BS_32x64] = { CDF1(22823) }, + [BS_64x16] = { CDF1(26879) }, + [BS_64x32] = { CDF1(22083) }, + [BS_64x64] = { CDF1(30128) }, + [BS_64x128] = { CDF1(31014) }, + [BS_128x64] = { CDF1(31560) }, + [BS_128x128] = { CDF1(32638) }, + }, .pal_y = { + { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } }, + { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } }, + { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } }, + { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } }, + { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } }, + { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } }, + { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } }, + }, .pal_sz = { + { + { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) }, + { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) }, + { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) }, + { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) }, + { CDF6(12725, 19180, 21863, 24839, 27535, 30120) }, + { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) }, + { CDF6(14940, 20797, 21678, 24186, 27033, 28999) }, }, { - { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, - { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) }, - { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) }, - { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, - { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) }, + { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) }, + { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) }, + { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) }, + { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) }, + { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) }, + { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) }, + }, + }, .pal_uv = { + { CDF1(32461) }, { CDF1(21488) }, + }, .color_map = { + { /* y */ + { + { CDF1(28710) }, { 
CDF1(16384) }, { CDF1(10553) }, + { CDF1(27036) }, { CDF1(31603) }, + }, { + { CDF2(27877, 30490) }, { CDF2(11532, 25697) }, + { CDF2( 6544, 30234) }, { CDF2(23018, 28072) }, + { CDF2(31915, 32385) }, + }, { + { CDF3(25572, 28046, 30045) }, + { CDF3( 9478, 21590, 27256) }, + { CDF3( 7248, 26837, 29824) }, + { CDF3(19167, 24486, 28349) }, + { CDF3(31400, 31825, 32250) }, + }, { + { CDF4(24779, 26955, 28576, 30282) }, + { CDF4( 8669, 20364, 24073, 28093) }, + { CDF4( 4255, 27565, 29377, 31067) }, + { CDF4(19864, 23674, 26716, 29530) }, + { CDF4(31646, 31893, 32147, 32426) }, + }, { + { CDF5(23132, 25407, 26970, 28435, 30073) }, + { CDF5( 7443, 17242, 20717, 24762, 27982) }, + { CDF5( 6300, 24862, 26944, 28784, 30671) }, + { CDF5(18916, 22895, 25267, 27435, 29652) }, + { CDF5(31270, 31550, 31808, 32059, 32353) }, + }, { + { CDF6(23105, 25199, 26464, 27684, 28931, 30318) }, + { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) }, + { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) }, + { CDF6(18544, 22373, 24457, 26195, 28119, 30045) }, + { CDF6(31198, 31451, 31670, 31882, 32123, 32391) }, + }, { + { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }, { /* uv */ + { + { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) }, + { CDF1(29257) }, { CDF1(31610) }, + }, { + { CDF2(25257, 29145) }, { CDF2(12287, 27293) }, + { CDF2( 7033, 27960) }, { CDF2(20145, 25405) }, + { CDF2(30608, 31639) }, + }, { + { CDF3(24210, 27175, 29903) }, + { CDF3( 9888, 22386, 27214) }, + { CDF3( 5901, 26053, 29293) }, + { CDF3(18318, 22152, 28333) }, + { CDF3(30459, 31136, 31926) }, + }, { + { CDF4(22980, 25479, 27781, 29986) }, + { CDF4( 8413, 21408, 24859, 28874) }, + { CDF4( 2257, 29449, 30594, 31598) }, + { CDF4(19189, 21202, 25915, 28620) }, + 
{ CDF4(31844, 32044, 32281, 32518) }, + }, { + { CDF5(22217, 24567, 26637, 28683, 30548) }, + { CDF5( 7307, 16406, 19636, 24632, 28424) }, + { CDF5( 4441, 25064, 26879, 28942, 30919) }, + { CDF5(17210, 20528, 23319, 26750, 29582) }, + { CDF5(30674, 30953, 31396, 31735, 32207) }, + }, { + { CDF6(21239, 23168, 25044, 26962, 28705, 30506) }, + { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) }, + { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) }, + { CDF6(15889, 18323, 21704, 24698, 26976, 29690) }, + { CDF6(30988, 31204, 31479, 31734, 31983, 32325) }, + }, { + { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, }, + }, .intrabc = { + CDF1(30531) + }, + }, .mv = { + .comp = { + .classes = { + CDF10(28672, 30976, 31858, 32320, 32551, + 32656, 32740, 32757, 32762, 32767) + }, .class0 = { + CDF1(27648) + }, .classN = { + { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) }, + { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) }, + { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) }, + { CDF1(30720) }, + }, .class0_fp = { + { CDF3(16384, 24576, 26624) }, + { CDF3(12288, 21248, 24128) }, + }, .classN_fp = { + CDF3( 8192, 17408, 21248) + }, .class0_hp = { + CDF1(20480) + }, .classN_hp = { + CDF1(16384) + }, .sign = { + CDF1(16384) + }, + }, .joint = { + CDF3( 4096, 11264, 19328) + }, + }, .kfym = { + { + { CDF12(15588, 17027, 19338, 20218, 20682, 21110, + 21825, 23244, 24189, 28165, 29093, 30466) }, + { CDF12(12016, 18066, 19516, 20303, 20719, 21444, + 21888, 23032, 24434, 28658, 30172, 31409) }, + { CDF12(10052, 10771, 22296, 22788, 23055, 23239, + 24133, 25620, 26160, 29336, 29929, 31567) }, + { CDF12(14091, 15406, 16442, 18808, 19136, 19546, + 19998, 22096, 24746, 29585, 30958, 32462) }, + { CDF12(12122, 13265, 15603, 16501, 
18609, 20033, + 22391, 25583, 26437, 30261, 31073, 32475) }, + }, { + { CDF12(10023, 19585, 20848, 21440, 21832, 22760, + 23089, 24023, 25381, 29014, 30482, 31436) }, + { CDF12( 5983, 24099, 24560, 24886, 25066, 25795, + 25913, 26423, 27610, 29905, 31276, 31794) }, + { CDF12( 7444, 12781, 20177, 20728, 21077, 21607, + 22170, 23405, 24469, 27915, 29090, 30492) }, + { CDF12( 8537, 14689, 15432, 17087, 17408, 18172, + 18408, 19825, 24649, 29153, 31096, 32210) }, + { CDF12( 7543, 14231, 15496, 16195, 17905, 20717, + 21984, 24516, 26001, 29675, 30981, 31994) }, + }, { + { CDF12(12613, 13591, 21383, 22004, 22312, 22577, + 23401, 25055, 25729, 29538, 30305, 32077) }, + { CDF12( 9687, 13470, 18506, 19230, 19604, 20147, + 20695, 22062, 23219, 27743, 29211, 30907) }, + { CDF12( 6183, 6505, 26024, 26252, 26366, 26434, + 27082, 28354, 28555, 30467, 30794, 32086) }, + { CDF12(10718, 11734, 14954, 17224, 17565, 17924, + 18561, 21523, 23878, 28975, 30287, 32252) }, + { CDF12( 9194, 9858, 16501, 17263, 18424, 19171, + 21563, 25961, 26561, 30072, 30737, 32463) }, + }, { + { CDF12(12602, 14399, 15488, 18381, 18778, 19315, + 19724, 21419, 25060, 29696, 30917, 32409) }, + { CDF12( 8203, 13821, 14524, 17105, 17439, 18131, + 18404, 19468, 25225, 29485, 31158, 32342) }, + { CDF12( 8451, 9731, 15004, 17643, 18012, 18425, + 19070, 21538, 24605, 29118, 30078, 32018) }, + { CDF12( 7714, 9048, 9516, 16667, 16817, 16994, + 17153, 18767, 26743, 30389, 31536, 32528) }, + { CDF12( 8843, 10280, 11496, 15317, 16652, 17943, + 19108, 22718, 25769, 29953, 30983, 32485) }, + }, { + { CDF12(12578, 13671, 15979, 16834, 19075, 20913, + 22989, 25449, 26219, 30214, 31150, 32477) }, + { CDF12( 9563, 13626, 15080, 15892, 17756, 20863, + 22207, 24236, 25380, 29653, 31143, 32277) }, + { CDF12( 8356, 8901, 17616, 18256, 19350, 20106, + 22598, 25947, 26466, 29900, 30523, 32261) }, + { CDF12(10835, 11815, 13124, 16042, 17018, 18039, + 18947, 22753, 24615, 29489, 30883, 32482) }, + { CDF12( 7618, 8288, 9859, 10509, 
15386, 18657, + 22903, 28776, 29180, 31355, 31802, 32593) }, }, - }, .intrabc = { - CDF1(30531) - }, -}; - -static const CdfMvComponent default_mv_component_cdf = { - .classes = { - CDF10(28672, 30976, 31858, 32320, 32551, - 32656, 32740, 32757, 32762, 32767) - }, .class0 = { - CDF1(27648) - }, .classN = { - { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) }, - { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) }, - { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) }, - { CDF1(30720) }, - }, .class0_fp = { - { CDF3(16384, 24576, 26624) }, - { CDF3(12288, 21248, 24128) }, - }, .classN_fp = { - CDF3( 8192, 17408, 21248) - }, .class0_hp = { - CDF1(20480) - }, .classN_hp = { - CDF1(16384) - }, .sign = { - CDF1(16384) - }, -}; - -static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = { - CDF3( 4096, 11264, 19328) -}; - -static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = { - { - { CDF12(15588, 17027, 19338, 20218, 20682, 21110, - 21825, 23244, 24189, 28165, 29093, 30466) }, - { CDF12(12016, 18066, 19516, 20303, 20719, 21444, - 21888, 23032, 24434, 28658, 30172, 31409) }, - { CDF12(10052, 10771, 22296, 22788, 23055, 23239, - 24133, 25620, 26160, 29336, 29929, 31567) }, - { CDF12(14091, 15406, 16442, 18808, 19136, 19546, - 19998, 22096, 24746, 29585, 30958, 32462) }, - { CDF12(12122, 13265, 15603, 16501, 18609, 20033, - 22391, 25583, 26437, 30261, 31073, 32475) }, - }, { - { CDF12(10023, 19585, 20848, 21440, 21832, 22760, - 23089, 24023, 25381, 29014, 30482, 31436) }, - { CDF12( 5983, 24099, 24560, 24886, 25066, 25795, - 25913, 26423, 27610, 29905, 31276, 31794) }, - { CDF12( 7444, 12781, 20177, 20728, 21077, 21607, - 22170, 23405, 24469, 27915, 29090, 30492) }, - { CDF12( 8537, 14689, 15432, 17087, 17408, 18172, - 18408, 19825, 24649, 29153, 31096, 32210) }, - { CDF12( 7543, 14231, 15496, 16195, 17905, 20717, - 21984, 24516, 26001, 29675, 30981, 31994) }, - }, { - { CDF12(12613, 13591, 21383, 22004, 22312, 22577, - 23401, 25055, 
25729, 29538, 30305, 32077) }, - { CDF12( 9687, 13470, 18506, 19230, 19604, 20147, - 20695, 22062, 23219, 27743, 29211, 30907) }, - { CDF12( 6183, 6505, 26024, 26252, 26366, 26434, - 27082, 28354, 28555, 30467, 30794, 32086) }, - { CDF12(10718, 11734, 14954, 17224, 17565, 17924, - 18561, 21523, 23878, 28975, 30287, 32252) }, - { CDF12( 9194, 9858, 16501, 17263, 18424, 19171, - 21563, 25961, 26561, 30072, 30737, 32463) }, - }, { - { CDF12(12602, 14399, 15488, 18381, 18778, 19315, - 19724, 21419, 25060, 29696, 30917, 32409) }, - { CDF12( 8203, 13821, 14524, 17105, 17439, 18131, - 18404, 19468, 25225, 29485, 31158, 32342) }, - { CDF12( 8451, 9731, 15004, 17643, 18012, 18425, - 19070, 21538, 24605, 29118, 30078, 32018) }, - { CDF12( 7714, 9048, 9516, 16667, 16817, 16994, - 17153, 18767, 26743, 30389, 31536, 32528) }, - { CDF12( 8843, 10280, 11496, 15317, 16652, 17943, - 19108, 22718, 25769, 29953, 30983, 32485) }, - }, { - { CDF12(12578, 13671, 15979, 16834, 19075, 20913, - 22989, 25449, 26219, 30214, 31150, 32477) }, - { CDF12( 9563, 13626, 15080, 15892, 17756, 20863, - 22207, 24236, 25380, 29653, 31143, 32277) }, - { CDF12( 8356, 8901, 17616, 18256, 19350, 20106, - 22598, 25947, 26466, 29900, 30523, 32261) }, - { CDF12(10835, 11815, 13124, 16042, 17018, 18039, - 18947, 22753, 24615, 29489, 30883, 32482) }, - { CDF12( 7618, 8288, 9859, 10509, 15386, 18657, - 22903, 28776, 29180, 31355, 31802, 32593) }, }, }; -static const CdfCoefContext av1_default_coef_cdf[4] = { +static const CdfCoefContext default_coef_cdf[4] = { [0] = { .skip = { { @@ -3951,10 +3958,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, { #define update_cdf_1d(n1d, name) \ do { \ - memcpy(dst->name, src->name, sizeof(dst->name)); \ dst->name[n1d] = 0; \ } while (0) - #define update_cdf_2d(n1d, n2d, name) \ for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j]) #define update_cdf_3d(n1d, n2d, n3d, name) \ @@ -3962,29 +3967,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader 
*const hdr, #define update_cdf_4d(n1d, n2d, n3d, n4d, name) \ for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l]) -#define update_bit_0d(name) \ - do { \ - dst->name[0] = src->name[0]; \ - dst->name[1] = 0; \ - } while (0) - -#define update_bit_1d(n1d, name) \ - for (int i = 0; i < (n1d); i++) update_bit_0d(name[i]) -#define update_bit_2d(n1d, n2d, name) \ - for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j]) -#define update_bit_3d(n1d, n2d, n3d, name) \ - for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k]) + memcpy(dst, src, offsetof(CdfContext, m.intrabc)); - update_bit_1d(N_BS_SIZES, m.use_filter_intra); - update_cdf_1d(4, m.filter_intra); - update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode); - update_cdf_2d(8, 6, m.angle_delta); - update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz); - update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1); - update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2); - update_bit_1d(3, m.skip); - update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition); - update_bit_2d(N_TX_SIZES, 13, coef.skip); update_cdf_3d(2, 2, 4, coef.eob_bin_16); update_cdf_3d(2, 2, 5, coef.eob_bin_32); update_cdf_3d(2, 2, 6, coef.eob_bin_64); @@ -3992,106 +3976,104 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, update_cdf_3d(2, 2, 8, coef.eob_bin_256); update_cdf_2d(2, 9, coef.eob_bin_512); update_cdf_2d(2, 10, coef.eob_bin_1024); - update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit); update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok); update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok); - update_bit_2d(2, 3, coef.dc_sign); update_cdf_4d(4, 2, 21, 3, coef.br_tok); - update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id); - update_cdf_1d(7, m.cfl_sign); + update_cdf_4d(N_TX_SIZES, 2, 11 /*22*/, 1, coef.eob_hi_bit); + update_cdf_3d(N_TX_SIZES, 13, 1, coef.skip); + update_cdf_3d(2, 3, 1, coef.dc_sign); + + update_cdf_3d(2, N_INTRA_PRED_MODES, 
N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode); + update_cdf_2d(4, N_PARTITIONS - 3, m.partition[BL_128X128]); + for (int k = BL_64X64; k < BL_8X8; k++) + update_cdf_2d(4, N_PARTITIONS - 1, m.partition[k]); + update_cdf_2d(4, N_SUB8X8_PARTITIONS - 1, m.partition[BL_8X8]); update_cdf_2d(6, 15, m.cfl_alpha); - update_bit_0d(m.restore_wiener); - update_bit_0d(m.restore_sgrproj); - update_cdf_1d(2, m.restore_switchable); - update_cdf_1d(3, m.delta_q); - update_cdf_2d(5, 3, m.delta_lf); - update_bit_2d(7, 3, m.pal_y); - update_bit_1d(2, m.pal_uv); - update_cdf_3d(2, 7, 6, m.pal_sz); - update_cdf_4d(2, 7, 5, k + 1, m.color_map); - update_bit_2d(7, 3, m.txpart); update_cdf_2d(2, 15, m.txtp_inter1); update_cdf_1d(11, m.txtp_inter2); - update_bit_1d(4, m.txtp_inter3); - - if (IS_KEY_OR_INTRA(hdr)) { - update_bit_0d(m.intrabc); + update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1); + update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2); + update_cdf_1d(7, m.cfl_sign); + update_cdf_2d(8, 6, m.angle_delta); + update_cdf_1d(4, m.filter_intra); + update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id); + update_cdf_3d(2, 7, 6, m.pal_sz); + update_cdf_4d(2, 7, 5, k + 1, m.color_map); + update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz); + update_cdf_1d(3, m.delta_q); + update_cdf_2d(5, 3, m.delta_lf); + update_cdf_1d(2, m.restore_switchable); + update_cdf_1d(1, m.restore_wiener); + update_cdf_1d(1, m.restore_sgrproj); + update_cdf_2d(4, 1, m.txtp_inter3); + update_cdf_2d(N_BS_SIZES, 1, m.use_filter_intra); + update_cdf_3d(7, 3, 1, m.txpart); + update_cdf_2d(3, 1, m.skip); + update_cdf_3d(7, 3, 1, m.pal_y); + update_cdf_2d(2, 1, m.pal_uv); - update_cdf_1d(N_MV_JOINTS - 1, dmv.joint); - for (int k = 0; k < 2; k++) { - update_cdf_1d(10, dmv.comp[k].classes); - update_bit_0d(dmv.comp[k].class0); - update_bit_1d(10, dmv.comp[k].classN); - update_bit_0d(dmv.comp[k].sign); - } + if (IS_KEY_OR_INTRA(hdr)) return; - } - update_bit_1d(3, m.skip_mode); + memcpy(dst->m.y_mode, 
src->m.y_mode, + offsetof(CdfContext, kfym) - offsetof(CdfContext, m.y_mode)); + update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode); - update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter); - update_bit_1d(6, m.newmv_mode); - update_bit_1d(2, m.globalmv_mode); - update_bit_1d(6, m.refmv_mode); - update_bit_1d(3, m.drl_bit); - update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode); - update_bit_1d(4, m.intra); - update_bit_1d(5, m.comp); - update_bit_1d(5, m.comp_dir); - update_bit_1d(6, m.jnt_comp); - update_bit_1d(6, m.mask_comp); - update_bit_1d(9, m.wedge_comp); update_cdf_2d(9, 15, m.wedge_idx); - update_bit_2d(6, 3, m.ref); - update_bit_2d(3, 3, m.comp_fwd_ref); - update_bit_2d(2, 3, m.comp_bwd_ref); - update_bit_2d(3, 3, m.comp_uni_ref); - update_bit_1d(3, m.seg_pred); - update_bit_1d(4, m.interintra); - update_bit_1d(7, m.interintra_wedge); + update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode); + update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter); update_cdf_2d(4, 3, m.interintra_mode); update_cdf_2d(N_BS_SIZES, 2, m.motion_mode); - update_bit_1d(N_BS_SIZES, m.obmc); + update_cdf_2d(3, 1, m.skip_mode); + update_cdf_2d(6, 1, m.newmv_mode); + update_cdf_2d(2, 1, m.globalmv_mode); + update_cdf_2d(6, 1, m.refmv_mode); + update_cdf_2d(3, 1, m.drl_bit); + update_cdf_2d(4, 1, m.intra); + update_cdf_2d(5, 1, m.comp); + update_cdf_2d(5, 1, m.comp_dir); + update_cdf_2d(6, 1, m.jnt_comp); + update_cdf_2d(6, 1, m.mask_comp); + update_cdf_2d(9, 1, m.wedge_comp); + update_cdf_3d(6, 3, 1, m.ref); + update_cdf_3d(3, 3, 1, m.comp_fwd_ref); + update_cdf_3d(2, 3, 1, m.comp_bwd_ref); + update_cdf_3d(3, 3, 1, m.comp_uni_ref); + update_cdf_2d(3, 1, m.seg_pred); + update_cdf_2d(4, 1, m.interintra); + update_cdf_2d(7, 1, m.interintra_wedge); + update_cdf_2d(N_BS_SIZES, 1, m.obmc); - update_cdf_1d(N_MV_JOINTS - 1, mv.joint); for (int k = 0; k < 2; k++) { update_cdf_1d(10, mv.comp[k].classes); - update_bit_0d(mv.comp[k].class0); - 
update_bit_1d(10, mv.comp[k].classN); + update_cdf_1d(1, mv.comp[k].sign); + update_cdf_1d(1, mv.comp[k].class0); update_cdf_2d(2, 3, mv.comp[k].class0_fp); + update_cdf_1d(1, mv.comp[k].class0_hp); + update_cdf_2d(10, 1, mv.comp[k].classN); update_cdf_1d(3, mv.comp[k].classN_fp); - update_bit_0d(mv.comp[k].class0_hp); - update_bit_0d(mv.comp[k].classN_hp); - update_bit_0d(mv.comp[k].sign); + update_cdf_1d(1, mv.comp[k].classN_hp); } + update_cdf_1d(N_MV_JOINTS - 1, mv.joint); } /* * CDF threading wrappers. */ -static inline int get_qcat_idx(const int q) { - if (q <= 20) return 0; - if (q <= 60) return 1; - if (q <= 120) return 2; - return 3; -} - -void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) { +void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const unsigned qidx) { cdf->ref = NULL; - cdf->data.qcat = get_qcat_idx(qidx); + cdf->data.qcat = (qidx > 20) + (qidx > 60) + (qidx > 120); } void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) { if (src->ref) { memcpy(dst, src->data.cdf, sizeof(*dst)); } else { - dst->m = av1_default_cdf; - memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf)); - dst->coef = av1_default_coef_cdf[src->data.qcat]; - memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf)); - memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf)); - dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] = - default_mv_component_cdf; + dst->coef = default_coef_cdf[src->data.qcat]; + memcpy(&dst->m, &default_cdf.m, + offsetof(CdfDefaultContext, mv.joint)); + memcpy(&dst->mv.comp[1], &default_cdf.mv.comp, + sizeof(default_cdf) - offsetof(CdfDefaultContext, mv.comp)); } } diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h index 4b30474baa..c9b516dc72 100644 --- a/third_party/dav1d/src/cdf.h +++ b/third_party/dav1d/src/cdf.h @@ -34,12 +34,10 @@ #include "src/ref.h" #include "src/thread_data.h" 
-/* Buffers padded to [8] or [16] for SIMD where needed. */ +/* Buffers padded to [4]/[8]/[16] for SIMD where needed. */ typedef struct CdfModeContext { - ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32); ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32); - ALIGN(uint16_t wedge_idx[9][16], 32); ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32); ALIGN(uint16_t cfl_alpha[6][16], 32); ALIGN(uint16_t txtp_inter1[2][16], 32); @@ -49,23 +47,33 @@ typedef struct CdfModeContext { ALIGN(uint16_t cfl_sign[8], 16); ALIGN(uint16_t angle_delta[8][8], 16); ALIGN(uint16_t filter_intra[5 + 3], 16); - ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16); ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16); ALIGN(uint16_t pal_sz[2][7][7 + 1], 16); ALIGN(uint16_t color_map[2][7][5][8], 16); - ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8); ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8); - ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8); ALIGN(uint16_t delta_q[4], 8); ALIGN(uint16_t delta_lf[5][4], 8); - ALIGN(uint16_t interintra_mode[4][4], 8); ALIGN(uint16_t restore_switchable[3 + 1], 8); ALIGN(uint16_t restore_wiener[2], 4); ALIGN(uint16_t restore_sgrproj[2], 4); - ALIGN(uint16_t interintra[7][2], 4); - ALIGN(uint16_t interintra_wedge[7][2], 4); ALIGN(uint16_t txtp_inter3[4][2], 4); ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4); + ALIGN(uint16_t txpart[7][3][2], 4); + ALIGN(uint16_t skip[3][2], 4); + ALIGN(uint16_t pal_y[7][3][2], 4); + ALIGN(uint16_t pal_uv[2][2], 4); + + /* key/intra */ + ALIGN(uint16_t intrabc[2], 4); + + /* inter/switch */ + ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32); + ALIGN(uint16_t wedge_idx[9][16], 32); + ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16); + ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8); + ALIGN(uint16_t interintra_mode[4][4], 8); + ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8); + ALIGN(uint16_t skip_mode[3][2], 4); 
ALIGN(uint16_t newmv_mode[6][2], 4); ALIGN(uint16_t globalmv_mode[2][2], 4); ALIGN(uint16_t refmv_mode[6][2], 4); @@ -80,14 +88,10 @@ typedef struct CdfModeContext { ALIGN(uint16_t comp_fwd_ref[3][3][2], 4); ALIGN(uint16_t comp_bwd_ref[2][3][2], 4); ALIGN(uint16_t comp_uni_ref[3][3][2], 4); - ALIGN(uint16_t txpart[7][3][2], 4); - ALIGN(uint16_t skip[3][2], 4); - ALIGN(uint16_t skip_mode[3][2], 4); ALIGN(uint16_t seg_pred[3][2], 4); + ALIGN(uint16_t interintra[7][2], 4); + ALIGN(uint16_t interintra_wedge[7][2], 4); ALIGN(uint16_t obmc[N_BS_SIZES][2], 4); - ALIGN(uint16_t pal_y[7][3][2], 4); - ALIGN(uint16_t pal_uv[2][2], 4); - ALIGN(uint16_t intrabc[2], 4); } CdfModeContext; typedef struct CdfCoefContext { @@ -108,13 +112,13 @@ typedef struct CdfCoefContext { typedef struct CdfMvComponent { ALIGN(uint16_t classes[11 + 5], 32); + ALIGN(uint16_t sign[2], 4); + ALIGN(uint16_t class0[2], 4); ALIGN(uint16_t class0_fp[2][4], 8); - ALIGN(uint16_t classN_fp[4], 8); ALIGN(uint16_t class0_hp[2], 4); - ALIGN(uint16_t classN_hp[2], 4); - ALIGN(uint16_t class0[2], 4); ALIGN(uint16_t classN[10][2], 4); - ALIGN(uint16_t sign[2], 4); + ALIGN(uint16_t classN_fp[4], 8); + ALIGN(uint16_t classN_hp[2], 4); } CdfMvComponent; typedef struct CdfMvContext { @@ -123,10 +127,10 @@ typedef struct CdfMvContext { } CdfMvContext; typedef struct CdfContext { + CdfCoefContext coef; CdfModeContext m; + CdfMvContext mv; ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32); - CdfCoefContext coef; - CdfMvContext mv, dmv; } CdfContext; typedef struct CdfThreadContext { @@ -138,7 +142,7 @@ typedef struct CdfThreadContext { atomic_uint *progress; } CdfThreadContext; -void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx); +void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, unsigned qidx); int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf, const int have_frame_mt); void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src); diff --git 
a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index eed9dfb756..7427c35592 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -73,42 +73,29 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr, } } -static int read_mv_component_diff(Dav1dTaskContext *const t, +static int read_mv_component_diff(MsacContext *const msac, CdfMvComponent *const mv_comp, - const int have_fp) + const int mv_prec) { - Dav1dTileState *const ts = t->ts; - const Dav1dFrameContext *const f = t->f; - const int have_hp = f->frame_hdr->hp; - const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign); - const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac, - mv_comp->classes, 10); - int up, fp, hp; + const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign); + const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10); + int up, fp = 3, hp = 1; if (!cl) { - up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0); - if (have_fp) { - fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->class0_fp[up], 3); - hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->class0_hp) : 1; - } else { - fp = 3; - hp = 1; + up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0); + if (mv_prec >= 0) { // !force_integer_mv + fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3); + if (mv_prec > 0) // allow_high_precision_mv + hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp); } } else { up = 1 << cl; for (int n = 0; n < cl; n++) - up |= dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->classN[n]) << n; - if (have_fp) { - fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->classN_fp, 3); - hp = have_hp ? 
dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->classN_hp) : 1; - } else { - fp = 3; - hp = 1; + up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n; + if (mv_prec >= 0) { // !force_integer_mv + fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3); + if (mv_prec > 0) // allow_high_precision_mv + hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp); } } @@ -117,25 +104,16 @@ static int read_mv_component_diff(Dav1dTaskContext *const t, return sign ? -diff : diff; } -static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv, - CdfMvContext *const mv_cdf, const int have_fp) +static void read_mv_residual(Dav1dTileState *const ts, mv *const ref_mv, + const int mv_prec) { - switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, - N_MV_JOINTS - 1)) - { - case MV_JOINT_HV: - ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); - ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); - break; - case MV_JOINT_H: - ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); - break; - case MV_JOINT_V: - ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); - break; - default: - break; - } + MsacContext *const msac = &ts->msac; + const enum MVJoint mv_joint = + dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1); + if (mv_joint & MV_JOINT_V) + ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec); + if (mv_joint & MV_JOINT_H) + ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec); } static void read_tx_tree(Dav1dTaskContext *const t, @@ -1001,8 +979,7 @@ static int decode_b(Dav1dTaskContext *const t, const int have_delta_q = f->frame_hdr->delta.q.present && (bs != (f->seq_hdr->sb128 ? 
BS_128x128 : BS_64x64) || !b->skip); - int8_t prev_delta_lf[4]; - memcpy(prev_delta_lf, ts->last_delta_lf, 4); + uint32_t prev_delta_lf = ts->last_delta_lf.u32; if (have_delta_q) { int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac, @@ -1038,8 +1015,8 @@ static int decode_b(Dav1dTaskContext *const t, delta_lf = -delta_lf; delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2; } - ts->last_delta_lf[i] = - iclip(ts->last_delta_lf[i] + delta_lf, -63, 63); + ts->last_delta_lf.i8[i] = + iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63); if (have_delta_q && DEBUG_BLOCK_INFO) printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf, ts->msac.rng); @@ -1054,13 +1031,13 @@ static int decode_b(Dav1dTaskContext *const t, init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem); ts->dq = ts->dqmem; } - if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) { + if (!ts->last_delta_lf.u32) { // assign frame-wide lf values to this sb ts->lflvl = f->lf.lvl; - } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) { + } else if (ts->last_delta_lf.u32 != prev_delta_lf) { // find sb-specific lf lvl parameters - dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf); ts->lflvl = ts->lflvlmem; + dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8); } } @@ -1324,7 +1301,7 @@ static int decode_b(Dav1dTaskContext *const t, } const union mv ref = b->mv[0]; - read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0); + read_mv_residual(ts, &b->mv[0], -1); // clip intrabc motion vector to decoded parts of current tile int border_left = ts->tiling.col_start * 4; @@ -1586,8 +1563,8 @@ static int decode_b(Dav1dTaskContext *const t, break; \ case NEWMV: \ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \ - read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \ - !f->frame_hdr->force_integer_mv); \ + const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \ + read_mv_residual(ts, &b->mv[idx], mv_prec); \ break; \ } has_subpel_filter = imin(bw4, bh4) == 1 
|| @@ -1775,8 +1752,8 @@ static int decode_b(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-intermode[%d,drl=%d]: r=%d\n", b->inter_mode, b->drl_idx, ts->msac.rng); - read_mv_residual(t, &b->mv[0], &ts->cdf.mv, - !f->frame_hdr->force_integer_mv); + const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; + read_mv_residual(ts, &b->mv[0], mv_prec); if (DEBUG_BLOCK_INFO) printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n", b->mv[0].y, b->mv[0].x, ts->msac.rng); @@ -2495,7 +2472,7 @@ static void setup_tile(Dav1dTileState *const ts, dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf); ts->last_qidx = f->frame_hdr->quant.yac; - memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf)); + ts->last_delta_lf.u32 = 0; dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h index 72f65607ed..96bf409c6c 100644 --- a/third_party/dav1d/src/internal.h +++ b/third_party/dav1d/src/internal.h @@ -303,8 +303,8 @@ struct Dav1dFrameContext { int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */ int re_sz /* h */; ALIGN(Av1FilterLUT lim_lut, 16); + ALIGN(uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16); int last_sharpness; - uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; uint8_t *tx_lpf_right_edge[2]; uint8_t *cdef_line_buf, *lr_line_buf; pixel *cdef_line[2 /* pre, post */][3 /* plane */]; @@ -376,8 +376,11 @@ struct Dav1dTileState { const uint16_t (*dq)[3][2]; int last_qidx; - int8_t last_delta_lf[4]; - uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; + union { + int8_t i8[4]; + uint32_t u32; + } last_delta_lf; + ALIGN(uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16); const uint8_t (*lflvl)[4][8][2]; Av1RestorationUnit *lr_ref[3]; diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h index d522079907..8ef4f4df48 100644 --- 
a/third_party/dav1d/src/itx.h +++ b/third_party/dav1d/src/itx.h @@ -39,10 +39,73 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \ HIGHBD_DECL_SUFFIX) typedef decl_itx_fn(*itxfm_fn); +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + typedef struct Dav1dInvTxfmDSPContext { itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL]; } Dav1dInvTxfmDSPContext; bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + 
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + #endif /* DAV1D_SRC_ITX_H */ diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c index 062ba67371..09a5c532c4 100644 --- a/third_party/dav1d/src/lf_mask.c +++ b/third_party/dav1d/src/lf_mask.c @@ -436,7 +436,7 @@ static void calc_lf_value(uint8_t (*const lflvl_values)[2], const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63); if (!mr_delta) { - memset(lflvl_values, base, 8 * 2); + memset(lflvl_values, base, sizeof(*lflvl_values) * 8); } else { const int sh = base >= 32; lflvl_values[0][0] = lflvl_values[0][1] = @@ -457,7 +457,7 @@ static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2], const 
Dav1dLoopfilterModeRefDeltas *const mr_delta) { if (!base_lvl) - memset(lflvl_values, 0, 8 * 2); + memset(lflvl_values, 0, sizeof(*lflvl_values) * 8); else calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta); } @@ -469,7 +469,7 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2], const int n_seg = hdr->segmentation.enabled ? 8 : 1; if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) { - memset(lflvl_values, 0, 8 * 4 * 2 * n_seg); + memset(lflvl_values, 0, sizeof(*lflvl_values) * n_seg); return; } diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index dc4be5fd6f..cd19b70c38 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -106,6 +106,7 @@ if is_asm_enabled 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', + 'arm/64/mc_dotprod.S', ) endif diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c index 200afebde7..1da024b630 100644 --- a/third_party/dav1d/src/refmvs.c +++ b/third_party/dav1d/src/refmvs.c @@ -817,7 +817,9 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { if (rf->r) dav1d_freep_aligned(&rf->r); const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; - rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); + /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm, + * so add 4 bytes of padding to avoid buffer overreads. 
*/ + rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64); if (!rf->r) return DAV1D_ERR(ENOMEM); rf->r_stride = r_stride; } diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h index d3f9a03a03..e11b138348 100644 --- a/third_party/dav1d/src/riscv/itx.h +++ b/third_party/dav1d/src/riscv/itx.h @@ -28,34 +28,6 @@ #include "src/cpu.h" #include "src/itx.h" -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ decl_itx16_fns( 4, 8, ext); \ @@ -70,41 +42,6 @@ decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); static 
ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm index 35738e7c0b..2956ffaf29 100644 --- a/third_party/dav1d/src/x86/ipred_avx2.asm +++ b/third_party/dav1d/src/x86/ipred_avx2.asm @@ -66,7 +66,8 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 
19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 -z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 +const \ +z_filter_s, db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line pb_128: times 4 db 128 ; those are just placed here for alignment. diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h index 346fde7d90..23d7a73806 100644 --- a/third_party/dav1d/src/x86/itx.h +++ b/third_party/dav1d/src/x86/itx.h @@ -30,34 +30,6 @@ #define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ 
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ decl_itx16_fns( 4, 8, ext); \ @@ -136,42 +108,6 @@ decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - 
assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - - #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm index 42e2a5525e..6b4424946b 100644 --- a/third_party/dav1d/src/x86/mc16_avx2.asm +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -1222,7 +1222,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; prefix, type, type_h, type_v +%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1230,8 +1230,8 @@ cglobal %1_%2_16bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1242,22 +1242,17 @@ DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v 
+ add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx2] movifnidn wd, wm movifnidn hd, hm @@ -1265,6 +1260,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1337,43 +1333,36 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp wd, 4 je .h_w4 jl .h_w2 - WIN64_SPILL_XMM 13 + WIN64_SPILL_XMM 11 shr mxd, 16 - sub srcq, 6 - vpbroadcastq m0, [base+subpel_filters+mxq*8] - vbroadcasti128 m6, [subpel_h_shufA] - vbroadcasti128 m7, [subpel_h_shufB] + sub srcq, 4 + vpbroadcastq m0, [base+subpel_filters+1+mxq*8] + vbroadcasti128 m6, [base+subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend - pshufd m8, m0, q0000 - pshufd m9, m0, q1111 - pshufd m10, m0, q2222 - pshufd m11, m0, q3333 - cmp wd, 8 - jg .h_w16 + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + sub wd, 16 + jge .h_w16 .h_w8: -%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] - pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 - pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 - pmaddwd m%5, m9, m%4 ; abcd1 - pmaddwd m%1, m8 ; abcd0 - pshufb m%2, m7 ; 6 7 7 8 8 9 9 a - shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 - paddd m%5, m4 - paddd m%1, m%5 - pmaddwd m%5, m11, m%2 ; abcd3 - paddd m%1, m%5 - pmaddwd m%5, m10, m%4 ; abcd2 - pshufb m%3, m7 ; a b b c c d d e - pmaddwd m%4, m8 ; efgh0 - paddd m%1, m%5 - pmaddwd m%5, m9, m%2 ; efgh1 - shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c - pmaddwd m%3, m11 ; efgh3 - pmaddwd m%2, m10 ; efgh2 +%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m6 ; 01 12 23 34 + pshufb m%2, m6 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m6 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 paddd m%4, m4 - paddd m%4, 
m%5 - paddd m%3, m%4 + paddd m%3, m4 + paddd m%1, m%4 paddd m%2, m%3 psrad m%1, 6 psrad m%2, 6 @@ -1384,9 +1373,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 - lea srcq, [srcq+ssq*2] shufpd m1, m0, m2, 0x05 - PUT_8TAP_H 0, 1, 2, 3, 12 + lea srcq, [srcq+ssq*2] + PUT_6TAP_H 0, 1, 2, 3, 10 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] @@ -1396,13 +1385,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .h_w16: mov r6d, wd .h_w16_loop: - movu m0, [srcq+r6*2-32] - movu m1, [srcq+r6*2-24] - movu m2, [srcq+r6*2-16] - PUT_8TAP_H 0, 1, 2, 3, 12 - mova [dstq+r6*2-32], m0 + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 8] + movu m2, [srcq+r6*2+16] + PUT_6TAP_H 0, 1, 2, 3, 10 + mova [dstq+r6*2], m0 sub r6d, 16 - jg .h_w16_loop + jge .h_w16_loop add srcq, ssq add dstq, dsq dec hd @@ -1411,10 +1400,449 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .v: movzx mxd, myb shr myd, 16 - cmp hd, 4 - cmovle myd, mxd + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 10, 12 + vpbroadcastd m5, [pd_32] + vpbroadcastw m6, r8m + punpcklbw m0, m0 + mov r6, ssq + psraw m0, 8 ; sign-extend + neg r6 + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd xm2, [srcq+r6 *2] + pinsrd xm2, [srcq+r6 *1], 1 + pinsrd xm2, [srcq+ssq*0], 2 + pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + movd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklwd xm1, xm2, xm3 ; 01 12 + punpckhwd xm2, xm3 ; 23 34 +.v_w2_loop: + movd xm3, [srcq+ssq*1] + pmaddwd xm4, xm7, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm8 ; a1 b1 + lea srcq, [srcq+ssq*2] + paddd xm4, xm2 + punpckldq xm2, xm0, xm3 ; 4 5 + movd xm0, [srcq+ssq*0] + punpckldq xm3, xm0 ; 5 6 + punpcklwd xm2, xm3 ; 45 56 + pmaddwd xm3, xm9, 
xm2 ; a2 b2 + paddd xm4, xm5 + paddd xm4, xm3 + psrad xm4, 6 + packusdw xm4, xm4 + pminsw xm4, xm6 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm1, [srcq+r6 *2] + vpbroadcastq m3, [srcq+r6 *1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklwd m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklwd m2, m4 ; 23 34 +.v_w4_loop: + vpbroadcastq m3, [srcq+ssq*1] + pmaddwd m4, m7, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m8 ; a1 b1 + lea srcq, [srcq+ssq*2] + paddd m4, m2 + vpblendd m2, m0, m3, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m3, m0, 0x30 + punpcklwd m2, m3 ; 45 56 + pmaddwd m3, m9, m2 ; a2 b2 + paddd m4, m5 + paddd m4, m3 + psrad m4, 6 + vextracti128 xm3, m4, 1 + packusdw xm4, xm3 + pminsw xm4, xm6 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + shl wd, 5 + WIN64_PUSH_XMM 12 + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m3, [srcq+r6 *2] + vbroadcasti128 m4, [srcq+r6 *1] + lea r7, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r8, dstq + vbroadcasti128 m2, [r7+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.v_w8_loop: + vbroadcasti128 m5, [r7+ssq*1] + pmaddwd m10, m7, m1 ; a0 + lea r7, [r7+ssq*2] + pmaddwd m11, m7, m2 ; b0 + mova m1, m3 + pmaddwd m3, m8 ; a1 + mova m2, m4 + pmaddwd m4, m8 ; b1 + paddd m10, m3 + vbroadcasti128 m3, [r7+ssq*0] + paddd m11, m4 + shufpd m4, m0, m5, 0x0d + shufpd m0, m5, m3, 0x0c + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m5, m9, m3 ; a2 + paddd m10, m5 + pmaddwd m5, m9, m4 ; b2 + paddd m5, m11 + psrad m10, 5 + psrad m5, 5 + packusdw 
m10, m5 + pxor m5, m5 + pavgw m5, m10 + pminsw m5, m6 + vpermq m5, m5, q3120 + mova [r8+dsq*0], xm5 + vextracti128 [r8+dsq*1], m5, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .v_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .v_w8_loop0 + RET +.hv: + WIN64_SPILL_XMM 12, 16 + vpbroadcastd m10, [pd_512] + vpbroadcastw m11, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + neg r6 + pxor m6, m6 + punpcklbw m6, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit + psraw m6, 2 + psllw m1, 2 +.hv_10bit: + pshufd m7, m1, q0000 + pshufd m8, m1, q1111 + pshufd m9, m1, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m5, [subpel_h_shuf2] + vbroadcasti128 m0, [srcq+ssq*0] + vinserti128 m2, m0, [srcq+r6*2], 1 ; 2 0 + movu xm1, [srcq+ssq*1] + vinserti128 m1, [srcq+r6 *1], 1 ; 3 1 + lea srcq, [srcq+ssq*2] + vinserti128 m0, [srcq+ssq*0], 0 ; 4 2 + REPX {pshufb x, m5}, m2, m1, m0 + REPX {pmaddwd x, m6}, m2, m1, m0 + phaddd m2, m1 + phaddd m1, m0 + paddd m2, m10 + paddd m1, m10 + psrad m2, 10 + psrad m1, 10 + packssdw m2, m1 ; 2 3 3 4 0 1 1 2 + punpckhqdq m0, m2, m2 + punpcklwd m2, m0 ; 23 34 + vextracti128 xm1, m2, 1 ; 01 12 +.hv_w2_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu xm4, [srcq+ssq*0] + pshufb xm3, xm5 + pshufb xm4, xm5 + pmaddwd xm3, xm6 + pmaddwd xm4, xm6 + phaddd xm3, xm4 + pmaddwd xm4, xm7, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm8 ; a1 b1 + paddd xm4, xm2 + paddd xm3, xm10 + psrad xm3, 10 + packssdw xm3, xm3 + palignr xm2, xm3, xm0, 12 + mova xm0, xm3 + punpcklwd xm2, xm0 ; 45 56 + pmaddwd xm3, xm9, xm2 ; a2 b2 + paddd xm4, xm10 + paddd xm4, xm3 + psrad xm4, 10 + packusdw xm4, xm4 + pminsw xm4, xm11 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + 
RET +.hv_w4: + WIN64_PUSH_XMM 14 + vbroadcasti128 m12, [subpel_h_shufA] + pshufd m5, m6, q0000 + vbroadcasti128 m13, [subpel_h_shufB] + pshufd m6, m6, q1111 + movu xm2, [srcq+r6 *2] + vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0] ; 4 + pshufb m1, m2, m12 + pmaddwd m1, m5 + pshufb m2, m13 + pmaddwd m2, m6 + pshufb m4, m0, m12 + pmaddwd m4, m5 + pshufb m0, m13 + pmaddwd m0, m6 + paddd m2, m1 + pshufb xm1, xm3, xm12 + pmaddwd xm1, xm5 + pshufb xm3, xm13 + pmaddwd xm3, xm6 + paddd m0, m4 + paddd m2, m10 + paddd xm1, xm10 + paddd m0, m10 + paddd xm3, xm1 + REPX {psrad x, 10}, m2, m0, xm3 + packssdw m2, m0 ; 0 2 1 3 + packssdw xm0, xm3 ; 2 4 + vperm2i128 m0, m2, 0x03 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*0], 1 + pmaddwd m4, m7, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m8 ; a1 b1 + paddd m4, m2 + pshufb m2, m3, m12 + pmaddwd m2, m5 + pshufb m3, m13 + pmaddwd m3, m6 + paddd m2, m10 + paddd m3, m2 + psrad m3, 10 + packssdw m3, m3 ; 5 5 6 6 + vperm2i128 m2, m0, m3, 0x21 + mova m0, m3 + punpckhwd m2, m3 ; 45 56 + pmaddwd m3, m9, m2 ; a2 b2 + paddd m4, m10 + paddd m4, m3 + psrad m4, 10 + vextracti128 xm3, m4, 1 + packusdw xm4, xm3 + pminsw xm4, xm11 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + WIN64_PUSH_XMM 16, 12 + shr mxd, 16 + vbroadcasti128 m12, [subpel_h_shufA] + vpbroadcastq m2, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xm1, [base+subpel_filters+1+myq*8] + shl wd, 5 + mov r6, ssq + sub srcq, 4 + pxor m0, m0 + neg r6 + punpcklbw m0, m2 + lea wd, [hq+wq-256] + test dword r8m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 + psllw xm1, 2 +.hv_w8_10bit: + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 +%if WIN64 + %define v_mul 
(rsp+stack_offset+40) ; r4m +%else + %define v_mul (rsp+stack_offset+ 8) ; r6m +%endif + mova [v_mul], xm1 + pshufd m9, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m0, [srcq+ssq*0+ 0] + vinserti128 m3, m0, [srcq+r6*2+ 0], 0 + lea r7, [srcq+ssq*2] + vbroadcasti128 m2, [srcq+ssq*0+16] + vinserti128 m1, m2, [srcq+r6*2+16], 0 + mov r8, dstq + vinserti128 m0, [r7 +ssq*0+ 0], 1 + vinserti128 m2, [r7 +ssq*0+16], 1 + shufpd m4, m3, m1, 0x05 +%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m12 ; 01 12 23 34 + pshufb m%2, m12 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m12 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m10 + paddd m%3, m10 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 10 + psrad m%2, 10 + packssdw m%1, m%2 +%endmacro + PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 + movu xm4, [srcq+r6 *1+ 0] + vinserti128 m4, [srcq+ssq*1+ 0], 1 + shufpd m1, m0, m2, 0x05 + PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 + movu xm2, [srcq+r6 *1+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + shufpd m1, m4, m2, 0x05 + PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + vpbroadcastd m15, [v_mul+4*0] + vpbroadcastd m13, [v_mul+4*1] + movu xm5, [r7+ssq*1+ 0] + movu xm6, [r7+ssq*1+16] + lea r7, [r7+ssq*2] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + vinserti128 m5, [r7+ssq*0+ 0], 1 + vinserti128 m6, [r7+ssq*0+16], 1 + mova m1, m3 + pmaddwd m3, m13 ; a1 + mova m2, m4 + pmaddwd m4, m13 ; b1 + paddd m14, m3 + shufpd m3, m5, m6, 0x05 + paddd m15, m4 + PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6 + vpbroadcastd m6, [v_mul+4*2] + vpermq m5, m5, q3120 + shufpd m4, m0, m5, 0x05 + mova m0, m5 
+ punpcklwd m3, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pmaddwd m5, m6, m3 ; a2 + pmaddwd m6, m4 ; b2 + paddd m14, m10 + paddd m15, m10 + paddd m5, m14 + paddd m6, m15 + psrad m5, 10 + psrad m6, 10 + packusdw m5, m6 + pminsw m5, m11 + vpermq m5, m5, q3120 + mova [r8+dsq*0], xm5 + vextracti128 [r8+dsq*1], m5, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - WIN64_SPILL_XMM 15 + WIN64_SPILL_XMM 12, 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m lea r6, [ssq*3] @@ -1518,19 +1946,19 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my RET .v_w8: shl wd, 5 - mov r7, srcq - mov r8, dstq + WIN64_PUSH_XMM 15 lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] + lea r7, [srcq+ssq*4] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+ssq*2] - lea srcq, [srcq+ssq*4] - vbroadcasti128 m1, [srcq+ssq*0] - vbroadcasti128 m2, [srcq+ssq*1] - vbroadcasti128 m3, [srcq+ssq*2] - add srcq, r6 + mov r8, dstq + vbroadcasti128 m1, [r7+ssq*0] + vbroadcasti128 m2, [r7+ssq*1] + vbroadcasti128 m3, [r7+ssq*2] + add r7, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 @@ -1542,7 +1970,7 
@@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: - vbroadcasti128 m14, [srcq+ssq*0] + vbroadcasti128 m14, [r7+ssq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 @@ -1556,8 +1984,8 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 - vbroadcasti128 m5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vbroadcasti128 m5, [r7+ssq*1] + lea r7, [r7+ssq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c @@ -1574,41 +2002,121 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pavgw m12, m13 pminsw m12, m7 vpermq m12, m12, q3120 - mova [dstq+dsq*0], xm12 - vextracti128 [dstq+dsq*1], m12, 1 - lea dstq, [dstq+dsq*2] + mova [r8+dsq*0], xm12 + vextracti128 [r8+dsq*1], m12, 1 + lea r8, [r8+dsq*2] sub hd, 2 jg .v_w8_loop - add r7, 16 - add r8, 16 + add srcq, 16 + add dstq, 16 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 jg .v_w8_loop0 RET -.hv: - WIN64_SPILL_XMM 16 - vpbroadcastw m15, r8m +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m5, r8m + shr r7d, 11 + vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 - jg .hv_w8 - movzx mxd, mxb - vpbroadcastd m0, [base+subpel_filters+mxq*8+2] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmovle myd, mxd - vpbroadcastq m1, [base+subpel_filters+myq*8] - vpbroadcastd m6, [pd_512] - lea r6, [ssq*3] - sub srcq, 2 - sub srcq, r6 - pxor m7, m7 - punpcklbw m7, m0 - punpcklbw m1, m1 - psraw m1, 8 ; sign-extend - test dword r8m, 0x800 - jz .hv_10bit + jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2 + je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4 + WIN64_SPILL_XMM 13 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, 
m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + sub wd, 16 + jge .h_w16 +.h_w8: +%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m4 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m4 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packusdw m%1, m%2 + pminsw m%1, m5 +%endmacro + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 8] + movu m2, [srcq+r6*2+16] + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+r6*2], m0 + sub r6d, 16 + jge .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16 + RET +.hv: + WIN64_SPILL_XMM 16 + vpbroadcastw m15, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + vpbroadcastd m6, [pd_512] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit psraw m7, 2 psllw m1, 2 .hv_10bit: @@ -1773,17 +2281,15 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my vpbroadcastq m2, 
[base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 - cmp hd, 4 - cmovle myd, mxd + cmp hd, 6 + cmovs myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] shl wd, 5 lea r6, [ssq*3] sub srcq, 6 - sub srcq, r6 pxor m0, m0 + sub srcq, r6 punpcklbw m0, m2 - mov r7, srcq - mov r8, dstq lea wd, [hq+wq-256] test dword r8m, 0x800 jz .hv_w8_10bit @@ -1792,14 +2298,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 + mova [v_mul], xm1 pshufd m13, m0, q2222 pshufd m14, m0, q3333 -%if WIN64 - %define v_mul (rsp+stack_offset+40) ; r4m -%else - %define v_mul (rsp-24) ; red zone -%endif - mova [v_mul], xm1 .hv_w8_loop0: %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 @@ -1830,14 +2331,16 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %endmacro movu xm4, [srcq+r6 *1+ 0] vbroadcasti128 m8, [subpel_h_shufA] + lea r7, [srcq+ssq*4] movu xm6, [srcq+r6 *1+ 8] vbroadcasti128 m9, [subpel_h_shufB] + mov r8, dstq movu xm0, [srcq+r6 *1+16] vpbroadcastd m10, [pd_512] movu xm5, [srcq+ssq*0+ 0] - vinserti128 m5, [srcq+ssq*4+ 0], 1 + vinserti128 m5, [r7 +ssq*0+ 0], 1 movu xm1, [srcq+ssq*0+16] - vinserti128 m1, [srcq+ssq*4+16], 1 + vinserti128 m1, [r7 +ssq*0+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PUT_8TAP_HV_H 4, 6, 0 ; 3 @@ -1851,10 +2354,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my PUT_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+ssq*1+ 0] movu xm1, [srcq+ssq*1+16] - lea srcq, [srcq+ssq*4] - vinserti128 m6, [srcq+ssq*1+ 0], 1 - vinserti128 m1, [srcq+ssq*1+16], 1 - add srcq, r6 + vinserti128 m6, [r7 +ssq*1+ 0], 1 + vinserti128 m1, [r7 +ssq*1+16], 1 + add r7, r6 shufpd m7, m6, m1, 0x05 PUT_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 @@ -1885,13 +2387,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 - movu xm5, [srcq+ssq*0] - vinserti128 m5, [srcq+ssq*1], 1 + movu xm5, 
[r7+ssq*0] + vinserti128 m5, [r7+ssq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] - movu xm6, [srcq+ssq*0+16] - vinserti128 m6, [srcq+ssq*1+16], 1 - vextracti128 [dstq], m0, 1 + movu xm6, [r7+ssq*0+16] + vinserti128 m6, [r7+ssq*1+16], 1 + vextracti128 [r8], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 @@ -1902,9 +2404,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m13 pmaddwd m6, m14 paddd m6, m5 - movu xm5, [srcq+ssq*0+8] - vinserti128 m5, [srcq+ssq*1+8], 1 - lea srcq, [srcq+ssq*2] + movu xm5, [r7+ssq*0+8] + vinserti128 m5, [r7+ssq*1+8], 1 + lea r7, [r7+ssq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 @@ -1916,7 +2418,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 - vbroadcasti128 m6, [dstq] + vbroadcasti128 m6, [r8] paddd m8, m10 paddd m9, m10 paddd m0, m10 @@ -1938,36 +2440,512 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my packusdw m7, m9 pminsw m7, m15 vpermq m7, m7, q3120 - mova [dstq+dsq*0], xm7 - vextracti128 [dstq+dsq*1], m7, 1 - lea dstq, [dstq+dsq*2] + mova [r8+dsq*0], xm7 + vextracti128 [r8+dsq*1], m7, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%define base r7-prep_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx2] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + 
tzcnt wd, wd + mov r6d, r7m ; bitdepth_max + movzx wd, word [r7+wq*2+table_offset(prep,)] + vpbroadcastd m5, [r7-prep_avx2+pw_8192] + shr r6d, 11 + add wq, r7 + vpbroadcastd m4, [base+prep_mul+r6*4] + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm0, [base+subpel_filters+mxq*8] + vbroadcasti128 m3, [subpel_h_shufA] + lea r6, [ssq*3] + vbroadcasti128 m4, [subpel_h_shufB] + WIN64_SPILL_XMM 8 + pshufd xm0, xm0, q2211 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw xm0, 2 +.h_w4_12bpc: + vpbroadcastq m6, xm0 + vpermq m7, m0, q1111 +.h_w4_loop: + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*2], 1 + movu xm2, [srcq+ssq*1] + vinserti128 m2, [srcq+r6 *1], 1 + lea srcq, [srcq+ssq*4] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + cmp wd, 4 + je .h_w4 + shr mxd, 16 + sub srcq, 4 + vpbroadcastq m0, [base+subpel_filters+1+mxq*8] + WIN64_SPILL_XMM 10 + vbroadcasti128 m6, [subpel_h_shufA] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + cmp wd, 8 + jg .h_w16 +.h_w8: + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 +%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m6 ; 01 12 23 34 + pshufb m%2, m6 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m6 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, 
m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m5 + paddd m%3, m5 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + PREP_6TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_6TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, ssq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 9, 12 + vpbroadcastd m5, [prep_8tap_1d_rnd] + mov r6, ssq + punpcklbw m0, m0 + neg r6 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m0, 2 +.v_12bpc: + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + cmp wd, 4 + jg .v_w8 +.v_w4: + movq xm1, [srcq+r6 *2] + vpbroadcastq m3, [srcq+r6 *1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklwd m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklwd m2, m4 ; 23 34 +.v_w4_loop: + vpbroadcastq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m4, m6, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m7 ; a1 b1 + paddd m4, m2 + vpblendd m2, m0, m3, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m3, m0, 0x30 + punpcklwd m2, m3 ; 45 56 + pmaddwd m3, m8, m2 ; a2 b2 + paddd m4, m5 + paddd m4, m3 + psrad m4, 4 + vextracti128 xm3, m4, 1 + packssdw xm4, xm3 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + WIN64_PUSH_XMM 12 +%if WIN64 + push r8 +%endif + mov r8d, wd + shl wd, 5 + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 
m3, [srcq+r6 *2] + vbroadcasti128 m4, [srcq+r6 *1] + lea r5, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r7, tmpq + vbroadcasti128 m2, [r5+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.v_w8_loop: + vbroadcasti128 m9, [r5+ssq*1] + pmaddwd m10, m6, m1 ; a0 + lea r5, [r5+ssq*2] + pmaddwd m11, m6, m2 ; b0 + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + paddd m10, m5 + paddd m11, m5 + paddd m10, m3 + vbroadcasti128 m3, [r5+ssq*0] + paddd m11, m4 + shufpd m4, m0, m9, 0x0d + shufpd m0, m9, m3, 0x0c + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m9, m8, m3 ; a2 + paddd m10, m9 + pmaddwd m9, m8, m4 ; b2 + paddd m11, m9 + psrad m10, 4 + psrad m11, 4 + packssdw m10, m11 + vpermq m10, m10, q3120 + mova [r7+r8*0], xm10 + vextracti128 [r7+r8*2], m10, 1 + lea r7, [r7+r8*4] + sub hd, 2 + jg .v_w8_loop + add srcq, 16 + add tmpq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .v_w8_loop0 +%if WIN64 + pop r8 +%endif + RET +.hv: + WIN64_SPILL_XMM 13, 15 + vpbroadcastd m7, [prep_8tap_2d_rnd] + vbroadcasti128 m8, [subpel_h_shufA] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + pxor m6, m6 + neg r6 + punpcklbw m6, m0 + punpcklbw m1, m1 + psraw m6, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w4_10bit + psraw m6, 2 +.hv_w4_10bit: + pshufd m10, m1, q0000 + pshufd m11, m1, q1111 + pshufd m12, m1, q2222 +.hv_w4: + movu xm2, [srcq+r6 *2] + vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 + pshufd m5, m6, q0000 + vbroadcasti128 m9, [base+subpel_h_shufB] + movu xm0, [srcq+ssq*0] + pshufd m6, m6, q1111 + vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0] ; 4 + pshufb m1, m2, m8 + 
pmaddwd m1, m5 + pshufb m2, m9 + pmaddwd m2, m6 + pshufb m4, m0, m8 + pmaddwd m4, m5 + pshufb m0, m9 + pmaddwd m0, m6 + paddd m2, m1 + pshufb xm1, xm3, xm8 + pmaddwd xm1, xm5 + pshufb xm3, xm9 + pmaddwd xm3, xm6 + paddd m0, m4 + paddd m2, m7 + paddd xm1, xm7 + paddd m0, m7 + paddd xm3, xm1 + REPX {psrad x, 6}, m2, m0, xm3 + packssdw m2, m0 ; 0 2 1 3 + packssdw xm0, xm3 ; 2 4 + vperm2i128 m0, m2, 0x03 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*0], 1 + pmaddwd m4, m10, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m11 ; a1 b1 + paddd m4, m2 + pshufb m2, m3, m8 + pmaddwd m2, m5 + pshufb m3, m9 + pmaddwd m3, m6 + paddd m2, m7 + paddd m3, m2 + psrad m3, 6 + packssdw m3, m3 ; 5 5 6 6 + vperm2i128 m2, m0, m3, 0x21 + mova m0, m3 + punpckhwd m2, m3 ; 45 56 + pmaddwd m3, m12, m2 ; a2 b2 + paddd m4, m7 + paddd m4, m3 + psrad m4, 6 + vextracti128 xm3, m4, 1 + packssdw xm4, xm3 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + pmovsxbw xm1, [base+subpel_filters+1+myq*8] + WIN64_PUSH_XMM 15 +%if WIN64 + PUSH r8 +%endif + mov r8d, wd + shl wd, 5 + mov r6, ssq + sub srcq, 4 + neg r6 + lea wd, [hq+wq-256] + pxor m0, m0 + punpcklbw m0, m2 + psraw m0, 4 + test dword r7m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 +.hv_w8_10bit: + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + mova [v_mul], xm1 + pshufd m12, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m0, [srcq+ssq*0+ 0] + vinserti128 m3, m0, [srcq+r6*2+ 0], 0 + lea r5, [srcq+ssq*2] + vbroadcasti128 m2, [srcq+ssq*0+16] + vinserti128 m1, m2, [srcq+r6*2+16], 0 + mov r7, tmpq + vinserti128 m0, [r5 +ssq*0+ 0], 1 + vinserti128 m2, [r5 +ssq*0+16], 1 + shufpd m4, m3, m1, 0x05 +%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m8 ; 01 12 23 34 + pshufb m%2, m8 ; 45 56 67 78 + 
pmaddwd m%4, m10, m%1 ; a0 + pshufb m%3, m8 ; 89 9a ab bc + pmaddwd m%5, m12, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m10, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m12 ; b2 + pmaddwd m%1, m11 ; a1 + pmaddwd m%2, m11 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m7 + paddd m%3, m7 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packssdw m%1, m%2 +%endmacro + PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 + movu xm4, [srcq+r6 *1+ 0] + vinserti128 m4, [srcq+ssq*1+ 0], 1 + shufpd m1, m0, m2, 0x05 + PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 + movu xm2, [srcq+r6 *1+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + shufpd m1, m4, m2, 0x05 + PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + vpbroadcastd m14, [v_mul+4*0] + vpbroadcastd m9, [v_mul+4*1] + movu xm5, [r5+ssq*1+ 0] + movu xm6, [r5+ssq*1+16] + lea r5, [r5+ssq*2] + pmaddwd m13, m14, m1 ; a0 + pmaddwd m14, m2 ; b0 + vinserti128 m5, [r5+ssq*0+ 0], 1 + vinserti128 m6, [r5+ssq*0+16], 1 + mova m1, m3 + pmaddwd m3, m9 ; a1 + mova m2, m4 + pmaddwd m4, m9 ; b1 + paddd m13, m3 + shufpd m3, m5, m6, 0x05 + paddd m14, m4 + PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6 + vpbroadcastd m6, [v_mul+4*2] + vpermq m5, m5, q3120 + shufpd m4, m0, m5, 0x05 + mova m0, m5 + punpcklwd m3, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pmaddwd m5, m6, m3 ; a2 + pmaddwd m6, m4 ; b2 + paddd m13, m7 + paddd m14, m7 + paddd m5, m13 + paddd m6, m14 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermq m5, m5, q3120 + mova [r7+r8*0], xm5 + vextracti128 [r7+r8*2], m5, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .hv_w8_loop - add r7, 16 - add r8, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 jg .hv_w8_loop0 - RET - %if WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 + POP r8 %endif + RET -%define 
PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 @@ -1980,152 +2958,18 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - mov r6d, r7m ; bitdepth_max - movzx wd, word [r7+wq*2+table_offset(prep,)] - vpbroadcastd m5, [r7-prep_avx2+pw_8192] - shr r6d, 11 - add wq, r7 - vpbroadcastd m4, [base+prep_mul+r6*4] - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq -.h_w4: - movzx mxd, mxb - sub srcq, 2 - pmovsxbw xm0, [base+subpel_filters+mxq*8] - vbroadcasti128 m3, [subpel_h_shufA] - vbroadcasti128 m4, [subpel_h_shufB] - WIN64_SPILL_XMM 8 - pshufd xm0, xm0, q2211 - test dword r7m, 0x800 - jnz .h_w4_12bpc - psllw xm0, 2 -.h_w4_12bpc: - vpbroadcastq m6, xm0 - vpermq m7, m0, q1111 -.h_w4_loop: - movu xm1, [srcq+strideq*0] - vinserti128 m1, [srcq+strideq*2], 1 - movu xm2, [srcq+strideq*1] - vinserti128 m2, [srcq+r6 ], 1 - lea srcq, [srcq+strideq*4] - pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 - pshufb m1, m4 ; 2 3 3 4 4 5 5 6 - pmaddwd m0, m6 - pmaddwd m1, m7 - paddd m0, m5 - paddd m0, m1 - pshufb m1, m2, m3 - pshufb m2, m4 - pmaddwd m1, m6 - pmaddwd m2, m7 - paddd m1, m5 - paddd m1, m2 - psrad m0, 4 - psrad m1, 4 - packssdw m0, m1 - mova [tmpq], m0 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h: - test myd, 0xf00 - jnz 
.hv - vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) - lea r6, [strideq*3] - cmp wd, 4 - je .h_w4 - shr mxd, 16 - sub srcq, 6 - vpbroadcastq m0, [base+subpel_filters+mxq*8] - WIN64_SPILL_XMM 12 - vbroadcasti128 m6, [subpel_h_shufA] - vbroadcasti128 m7, [subpel_h_shufB] - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - test dword r7m, 0x800 - jnz .h_12bpc - psllw m0, 2 -.h_12bpc: - pshufd m8, m0, q0000 - pshufd m9, m0, q1111 - pshufd m10, m0, q2222 - pshufd m11, m0, q3333 - cmp wd, 8 - jg .h_w16 -.h_w8: -%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] - pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 - pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 - pmaddwd m%5, m9, m%4 ; abcd1 - pmaddwd m%1, m8 ; abcd0 - pshufb m%2, m7 ; 6 7 7 8 8 9 9 a - shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 - paddd m%5, m5 - paddd m%1, m%5 - pmaddwd m%5, m11, m%2 ; abcd3 - paddd m%1, m%5 - pmaddwd m%5, m10, m%4 ; abcd2 - pshufb m%3, m7 ; a b b c c d d e - pmaddwd m%4, m8 ; efgh0 - paddd m%1, m%5 - pmaddwd m%5, m9, m%2 ; efgh1 - shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c - pmaddwd m%3, m11 ; efgh3 - pmaddwd m%2, m10 ; efgh2 - paddd m%4, m5 - paddd m%4, m%5 - paddd m%3, m%4 - paddd m%2, m%3 - psrad m%1, 4 - psrad m%2, 4 - packssdw m%1, m%2 -%endmacro - movu xm0, [srcq+strideq*0+ 0] - vinserti128 m0, [srcq+strideq*1+ 0], 1 - movu xm2, [srcq+strideq*0+16] - vinserti128 m2, [srcq+strideq*1+16], 1 - lea srcq, [srcq+strideq*2] - shufpd m1, m0, m2, 0x05 - PREP_8TAP_H 0, 1, 2, 3, 4 - mova [tmpq], m0 - add tmpq, 32 - sub hd, 2 - jg .h_w8 - RET -.h_w16: - add wd, wd -.h_w16_loop0: - mov r6d, wd -.h_w16_loop: - movu m0, [srcq+r6-32] - movu m1, [srcq+r6-24] - movu m2, [srcq+r6-16] - PREP_8TAP_H 0, 1, 2, 3, 4 - mova [tmpq+r6-32], m0 - sub r6d, 32 - jg .h_w16_loop - add srcq, strideq - add tmpq, wq - dec hd - jg .h_w16_loop0 - RET + jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep .v: movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - 
WIN64_SPILL_XMM 15 + WIN64_SPILL_XMM 12, 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] - sub srcq, r6 punpcklbw m0, m0 + sub srcq, r6 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc @@ -2183,23 +3027,23 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my RET .v_w8: %if WIN64 + WIN64_PUSH_XMM 15 push r8 %endif mov r8d, wd shl wd, 5 - mov r5, srcq - mov r7, tmpq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] + lea r5, [srcq+strideq*4] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+strideq*2] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m3, [srcq+strideq*2] - add srcq, r6 + mov r7, tmpq + vbroadcasti128 m1, [r5+strideq*0] + vbroadcasti128 m2, [r5+strideq*1] + vbroadcasti128 m3, [r5+strideq*2] + add r5, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 @@ -2211,7 +3055,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: - vbroadcasti128 m14, [srcq+strideq*0] + vbroadcasti128 m14, [r5+strideq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 @@ -2227,8 +3071,8 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 - vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + vbroadcasti128 m5, [r5+strideq*1] + lea r5, [r5+strideq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c @@ -2242,22 +3086,101 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my psrad m13, 4 packssdw m12, m13 vpermq m12, m12, q3120 - mova [tmpq+r8*0], xm12 - vextracti128 [tmpq+r8*2], m12, 1 - lea tmpq, [tmpq+r8*4] + mova [r7+r8*0], xm12 + vextracti128 [r7+r8*2], m12, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .v_w8_loop - add r5, 16 - add r7, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r5 - 
mov tmpq, r7 sub wd, 1<<8 jg .v_w8_loop0 %if WIN64 pop r8 %endif RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + cmp wd, 4 + je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + WIN64_SPILL_XMM 12 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m5 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m5 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + movu xm0, [srcq+strideq*0+ 0] + vinserti128 m0, [srcq+strideq*1+ 0], 1 + movu xm2, [srcq+strideq*0+16] + vinserti128 m2, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + shufpd m1, m0, m2, 0x05 + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, strideq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET .hv: WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] @@ 
-2268,12 +3191,12 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 - sub srcq, r6 pxor m7, m7 + sub srcq, r6 punpcklbw m7, m0 punpcklbw m1, m1 psraw m7, 4 @@ -2375,7 +3298,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] %if WIN64 PUSH r8 @@ -2385,12 +3308,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my lea r6, [strideq*3] sub srcq, 6 sub srcq, r6 - mov r5, srcq - mov r7, tmpq lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 - mova [v_mul], xm1 psraw m0, 4 test dword r7m, 0x800 jz .hv_w8_10bit @@ -2398,6 +3318,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 + mova [v_mul], xm1 pshufd m13, m0, q2222 pshufd m14, m0, q3333 .hv_w8_loop0: @@ -2430,13 +3351,15 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endmacro movu xm4, [srcq+r6 + 0] vbroadcasti128 m8, [subpel_h_shufA] + lea r5, [srcq+strideq*4] movu xm6, [srcq+r6 + 8] vbroadcasti128 m9, [subpel_h_shufB] + mov r7, tmpq movu xm0, [srcq+r6 +16] movu xm5, [srcq+strideq*0+ 0] - vinserti128 m5, [srcq+strideq*4+ 0], 1 + vinserti128 m5, [r5 +strideq*0+ 0], 1 movu xm1, [srcq+strideq*0+16] - vinserti128 m1, [srcq+strideq*4+16], 1 + vinserti128 m1, [r5 +strideq*0+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PREP_8TAP_HV_H 4, 6, 0 ; 3 @@ -2450,10 +3373,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my PREP_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+strideq*1+ 0] movu xm1, [srcq+strideq*1+16] - lea srcq, [srcq+strideq*4] - vinserti128 m6, [srcq+strideq*1+ 0], 1 - vinserti128 m1, [srcq+strideq*1+16], 1 - add srcq, r6 + vinserti128 m6, [r5 +strideq*1+ 0], 1 + vinserti128 m1, [r5 +strideq*1+16], 
1 + add r5, r6 shufpd m7, m6, m1, 0x05 PREP_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 @@ -2486,13 +3408,13 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 - movu xm5, [srcq+strideq*0] - vinserti128 m5, [srcq+strideq*1], 1 + movu xm5, [r5+strideq*0] + vinserti128 m5, [r5+strideq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] - movu xm6, [srcq+strideq*0+16] - vinserti128 m6, [srcq+strideq*1+16], 1 - vextracti128 [tmpq], m0, 1 + movu xm6, [r5+strideq*0+16] + vinserti128 m6, [r5+strideq*1+16], 1 + vextracti128 [r7], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 @@ -2505,9 +3427,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m6, m14 paddd m5, m15 paddd m6, m5 - movu xm5, [srcq+strideq*0+8] - vinserti128 m5, [srcq+strideq*1+8], 1 - lea srcq, [srcq+strideq*2] + movu xm5, [r5+strideq*0+8] + vinserti128 m5, [r5+strideq*1+8], 1 + lea r5, [r5+strideq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 @@ -2518,7 +3440,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 - vbroadcasti128 m6, [tmpq] + vbroadcasti128 m6, [r7] vpbroadcastd m10, [v_mul+4*3] psrad m0, 6 psrad m5, 6 @@ -2535,16 +3457,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my psrad m9, 6 packssdw m7, m9 vpermq m7, m7, q3120 - mova [tmpq+r8*0], xm7 - vextracti128 [tmpq+r8*2], m7, 1 - lea tmpq, [tmpq+r8*4] + mova [r7+r8*0], xm7 + vextracti128 [r7+r8*2], m7, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .hv_w8_loop - add r5, 16 - add r7, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r5 - mov tmpq, r7 sub wd, 1<<8 jg .hv_w8_loop0 %if WIN64 @@ -4223,14 +5143,14 @@ DECLARE_REG_TMP 6, 8 %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, 
SHARP -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put @@ -4242,14 +5162,14 @@ DECLARE_REG_TMP 6, 7 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc PREP_8TAP_SCALED_FN 
regular, REGULAR, REGULAR MC_8TAP_SCALED prep diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index 58e3cb5af1..df8bebb1cb 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -60,15 +60,14 @@ subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 1 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 -bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f @@ -95,6 +94,7 @@ pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters cextern mc_warp_filter2 cextern resize_filter +cextern z_filter_s %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) @@ -184,7 +184,9 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE 
prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 @@ -298,7 +300,7 @@ INIT_YMM avx2 ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my @@ -900,7 +902,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my @@ -1436,7 +1438,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1444,8 +1446,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1456,28 +1458,24 @@ DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, 
ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx2] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1487,36 +1485,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] - vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] - add wq, r8 - jmp wq .h_w2: movzx mxd, mxb - dec srcq - mova xm4, [subpel_h_shuf4] - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + lea srcq, [srcq-1] + vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2] + je .h_w4 + mova xm3, [subpel_h_shuf4] .h_w2_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm3 + pshufb xm0, xm3 + pmaddubsw xm0, xm4 phaddw xm0, xm0 paddw xm0, xm5 psraw xm0, 6 @@ -1528,17 +1508,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w2_loop RET .h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + mova xm3, [subpel_h_shufA] .h_w4_loop: movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm6 - pshufb xm1, xm6 - pmaddubsw xm0, xm3 - pmaddubsw xm1, xm3 + pshufb xm0, xm3 + pshufb xm1, xm3 + pmaddubsw xm0, xm4 + pmaddubsw xm1, xm4 phaddw xm0, xm1 paddw xm0, xm5 psraw xm0, 6 @@ -1549,25 +1527,43 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, 
w, h, mx, my, ss3 sub hd, 2 jg .h_w4_loop RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle .h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m4, [z_filter_s+ 2] ; 01 + shr mxd, 16 + vbroadcasti128 m6, [z_filter_s+ 6] ; 23 + sub srcq, 2 + vbroadcasti128 m7, [z_filter_s+10] ; 45 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)] + vpbroadcastw m8, [mxq+0] + vpbroadcastw m9, [mxq+2] + add wq, r8 + vpbroadcastw m10, [mxq+4] + jmp wq .h_w8: -%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] - pshufb m%2, m%1, m7 - pshufb m%3, m%1, m8 - pshufb m%1, m6 - pmaddubsw m%4, m%2, m9 - pmaddubsw m%2, m10 - pmaddubsw m%3, m10 - pmaddubsw m%1, m9 - paddw m%3, m%4 +%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] + pshufb m%2, m%1, m4 + pmaddubsw m%2, m8 + pshufb m%3, m%1, m6 + pmaddubsw m%3, m9 + pshufb m%1, m7 + pmaddubsw m%1, m10 + paddw m%2, m5 + paddw m%1, m%3 paddw m%1, m%2 - phaddw m%1, m%3 - paddw m%1, m5 psraw m%1, 6 %endmacro movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3 + PUT_6TAP_H 0, 1, 2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 @@ -1581,9 +1577,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 - PUT_8TAP_H 0, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 @@ -1606,8 +1602,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .h_loop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+r6], m0 add r6, 32 @@ -1619,7 +1615,421 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_loop RET .v: - WIN64_SPILL_XMM 16 + 
WIN64_SPILL_XMM 9, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m8, [pw_512] + lea myq, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastw m5, [myq+0] + vpbroadcastw m6, [myq+2] + vpbroadcastw m7, [myq+4] + add r6, r8 + mov nsq, ssq + neg nsq + jmp r6 +.v_w2: + movd xm2, [srcq+nsq*2] + pinsrw xm2, [srcq+nsq*1], 2 + pinsrw xm2, [srcq+ssq*0], 4 + pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 67 78 + pmaddubsw xm4, xm2, xm7 ; a3 b3 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + pinsrd xm2, [srcq+ssq*0], 2 + pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w4_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 45 56 + pmaddubsw xm4, xm2, xm7 ; a2 b2 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq 
m3, [srcq+nsq*1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m3, m2 + vpblendd m2, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 45 56 + pmaddubsw m4, m2, m7 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m8 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + movq [dstq+dsq*0], xm3 + movhps [dstq+dsq*1], xm3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-128] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*2] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+nsq*2] + vbroadcasti128 m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r7, dstq + vbroadcasti128 m2, [r4+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m9, [r4+ssq*1] + pmaddubsw m10, m1, m5 ; a0 + lea r4, [r4+ssq*2] + pmaddubsw m11, m2, m5 ; b0 + mova m1, m3 + pmaddubsw m3, m6 ; a1 + mova m2, m4 + pmaddubsw m4, m6 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r4+ssq*0] + paddw m11, m4 + shufpd m4, m0, m9, 0x0d + shufpd m0, m9, m3, 0x0c + punpcklbw m3, m4, m0 ; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m9, m3, m7 ; a2 + paddw m10, m9 + pmaddubsw m9, m4, m7 ; b2 + paddw m11, m9 + pmulhrsw m10, m8 + pmulhrsw m11, m8 + packuswb m10, m11 + vpermq m10, m10, q3120 + mova [r7+dsq*0], xm10 + vextracti128 [r7+dsq*1], m10, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + 
RET +.hv: + WIN64_SPILL_XMM 12, 16 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastd m7, [pw_8192] + punpcklbw m0, m0 + vpbroadcastd m8, [pd_512] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m9, m0, q0000 + neg nsq + pshufd m10, m0, q1111 + pshufd m11, m0, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m5, [subpel_h_shuf4] + movq xm2, [srcq+nsq*2] + movhps xm2, [srcq+nsq*1] + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m2, m1, 0x30 + pshufb m2, m5 + pshufb xm0, xm5 + pmaddubsw m2, m6 + pmaddubsw xm0, xm6 + phaddw m2, m0 + pmulhrsw m2, m7 + vextracti128 xm0, m2, 1 + palignr xm0, xm2, 4 + punpcklwd xm1, xm2, xm0 ; 01 12 + punpckhwd xm2, xm0 ; 23 34 +.hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm5 + pmaddubsw xm4, xm6 + pmaddwd xm3, xm9, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm10 ; a1 b1 + phaddw xm4, xm4 + paddd xm3, xm2 + pmulhrsw xm4, xm7 + palignr xm2, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm2, xm4 ; 45 56 + pmaddwd xm4, xm11, xm2 ; a2 b2 + paddd xm3, xm8 + paddd xm3, xm4 + psrad xm3, 10 + packssdw xm3, xm3 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m5, [subpel_h_shuf4] + vpbroadcastq m2, [srcq+nsq*2] + vpbroadcastq m4, [srcq+nsq*1] + vpbroadcastq m1, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m5 + pshufb m1, m5 + pshufb m0, m5 + pmaddubsw m2, m6 + pmaddubsw m1, m6 + pmaddubsw m0, m6 + phaddw m2, m1 + phaddw m0, m0 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + palignr m3, m0, m2, 4 + punpcklwd m1, 
m2, m3 ; 01 12 + punpckhwd m2, m3 ; 23 34 +.hv_w4_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m3, m9, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m10 ; a1 b1 + paddd m3, m2 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m4, m2, 0xcc ; 5 6 + pshufb m4, m5 + pmaddubsw m4, m6 + phaddw m4, m4 + pmulhrsw m4, m7 + palignr m2, m4, m0, 12 + mova m0, m4 + punpcklwd m2, m4 ; 45 56 + pmaddwd m4, m11, m2 ; a2 b2 + paddd m3, m8 + paddd m3, m4 + psrad m3, 10 + vextracti128 xm4, m3, 1 + packssdw xm3, xm4 + packuswb xm3, xm3 + pshuflw xm3, xm3, q3120 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 2 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + lea r6d, [wq*8-64] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m13, m0, q0000 + neg nsq + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+nsq*2] + lea r4, [srcq+ssq*2] + movu xm4, [srcq+nsq*1] + vbroadcasti128 m0, [srcq+ssq*0] + mov r7, dstq + vinserti128 m4, [srcq+ssq*1], 1 ; 1 3 + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r4+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] +%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3] + pshufb %2, %1, %4 + pmaddubsw %2, m10 + pshufb %3, %1, %5 + pmaddubsw %3, m11 + pshufb %1, %6 + pmaddubsw %1, m12 + paddw %2, %3 + paddw %1, %2 +%endmacro + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 
+ punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m7, [r4+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_512] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 10 + psrad m6, 10 + packssdw m5, m6 + vextracti128 xm6, m5, 1 + packuswb xm5, xm6 + pshufd xm5, xm5, q3120 + movq [r7+dsq*0], xm5 + movhps [r7+dsq*1], xm5 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add dstq, 8 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put +.v: + WIN64_SPILL_XMM 12, 15 movzx mxd, myb shr myd, 16 cmp hd, 6 @@ -1765,19 +2175,19 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w64: .v_w128: lea r6d, [wq*8-128] - mov r4, srcq - mov r7, dstq + WIN64_PUSH_XMM 15 lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] + lea r4, 
[srcq+ss3q] vbroadcasti128 m6, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vbroadcasti128 m1, [srcq+ssq*1] - vbroadcasti128 m2, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m3, [srcq+ssq*0] + vbroadcasti128 m0, [r4+ssq*0] + mov r7, dstq + vbroadcasti128 m1, [r4+ssq*1] + vbroadcasti128 m2, [r4+ssq*2] + add r4, ss3q + vbroadcasti128 m3, [r4+ssq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 @@ -1789,51 +2199,138 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m13, [srcq+ssq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r4+ssq*1] + lea r4, [r4+ssq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r4+ssq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - packuswb m14, m15 - vpermq m14, m14, q3120 - mova [dstq+dsq*0], xm14 - vextracti128 [dstq+dsq*1], m14, 1 - lea dstq, [dstq+dsq*2] + packuswb m13, m14 + vpermq m13, m13, q3120 + mova [r7+dsq*0], xm13 + vextracti128 [r7+dsq*1], m13, 1 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add dstq, 16 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET -.hv: - WIN64_SPILL_XMM 16 - cmp wd, 4 +.h: +.h_w2: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, 
[pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufA] + shr mxd, 16 + vbroadcasti128 m7, [subpel_h_shufB] + sub srcq, 3 + vbroadcasti128 m8, [subpel_h_shufC] + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] + add wq, r8 + jmp wq +.h_w8: +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + pshufb m%2, m%1, m7 + pshufb m%3, m%1, m8 + pshufb m%1, m6 + pmaddubsw m%4, m%2, m9 + pmaddubsw m%2, m10 + pmaddubsw m%3, m10 + pmaddubsw m%1, m9 + paddw m%3, m%4 + paddw m%1, m%2 + phaddw m%1, m%3 + paddw m%1, m5 + psraw m%1, 6 +%endmacro + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + PUT_8TAP_H 0, 2, 3, 4 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + sub dstq, r6 + mov r4, r6 +.h_loop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 32 + jle .h_loop + add srcq, ssq + add dstq, dsq + mov r6, r4 + dec hd + jg .h_loop + RET +.hv: + WIN64_SPILL_XMM 14, 16 + cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq @@ -1975,6 +2472,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w4_loop RET .hv_w8: + WIN64_PUSH_XMM 16 shr mxd, 16 sub 
srcq, 3 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] @@ -1993,24 +2491,23 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd m14, m0, q2222 pshufd m15, m0, q3333 lea r6d, [wq*8-64] - mov r4, srcq - mov r7, dstq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] + lea r4, [srcq+ss3q] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] + mov r7, dstq vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 - vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 - add srcq, ss3q - vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 -%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + vbroadcasti128 m0, [r4+ssq*0] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r4+ssq*1], 1 ; 1 4 + vinserti128 m6, [r4+ssq*2], 1 ; 2 5 + add r4, ss3q + vinserti128 m0, [r4+ssq*0], 1 ; 3 6 +%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 pshufb %4, %1, %7 pshufb %1, %5 @@ -2022,10 +2519,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw %1, %3 phaddw %1, %2 %endmacro - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2043,9 +2540,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 r6m, m0, 1 ; not enough registers - movu xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 + movu xm0, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m0, [r4+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2063,15 
+2560,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_512] vbroadcasti128 m6, r6m pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2084,34 +2581,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vextracti128 xm7, m8, 1 packuswb xm8, xm7 pshufd xm7, xm8, q3120 - movq [dstq+dsq*0], xm7 - movhps [dstq+dsq*1], xm7 - lea dstq, [dstq+dsq*2] + movq [r7+dsq*0], xm7 + movhps [r7+dsq*1], xm7 + lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w8_loop - add r4, 8 - add r7, 8 + add srcq, 8 + add dstq, 8 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET -%macro PREP_8TAP_H 0 - pshufb m1, m0, m5 - pshufb m2, m0, m6 - pshufb m3, m0, m7 - pmaddubsw m1, m8 - pmaddubsw m0, m2, m8 - pmaddubsw m2, m9 - pmaddubsw m3, m9 - paddw m1, m2 - paddw m0, m3 - phaddw m0, m1, m0 - pmulhrsw m0, m4 -%endmacro - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -2119,71 +2600,197 @@ DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, 
stride, w, h, mx, my, stride3 +cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r7, [prep%+SUFFIX] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.prep: tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 - lea r6, [strideq*3] + lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m4, [pw_8192] - vbroadcasti128 m5, [subpel_h_shufA] - WIN64_SPILL_XMM 10 - cmp wd, 4 - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] - add wq, r7 - jmp wq +.v: + WIN64_SPILL_XMM 10, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + vpbroadcastd m9, [pw_8192] + vpbroadcastw m6, [myq+0] + mov nsq, ssq + vpbroadcastw m7, [myq+2] + neg nsq + vpbroadcastw m8, [myq+4] + cmp wd, 8 + jg .v_w16 + je .v_w8 +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + vpbroadcastd m1, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpbroadcastd m0, [srcq+ssq*2] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m1, m2, 0xeb + punpcklqdq m3, m0 + vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb m1, m5 ; 01 12 23 34 +.v_w4_loop: + lea srcq, [srcq+ssq*4] + pinsrd xm0, [srcq+nsq*1], 1 + vpbroadcastd m2, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpblendd m2, m0, 0xeb + vpbroadcastd m0, [srcq+ssq*2] + punpcklqdq m3, m0 + vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb m2, m5 ; 45 56 67 78 + pmaddubsw m3, m1, m6 ; a0 b0 c0 d0 + vperm2i128 m1, m2, 0x21 ; 23 34 45 56 + 
pmaddubsw m4, m2, m8 ; a2 b2 c2 d2 + pmaddubsw m1, m7 ; a1 b1 c1 d1 + paddw m3, m4 + paddw m3, m1 + pmulhrsw m3, m9 + mova m1, m2 + mova [tmpq], m3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq m3, [srcq+nsq*1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + lea srcq, [srcq+ssq*4] + pmaddubsw m1, m6 ; a0 + vpbroadcastq m3, [srcq+nsq*1] + pmaddubsw m4, m2, m7 ; a1 + pmaddubsw m5, m2, m6 ; b0 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m0, m3, 0x30 + vpblendd m3, m2, 0x30 + paddw m4, m1 + punpcklbw m1, m0, m3 ; 45 56 + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m3, 0x30 + vpblendd m3, m0, 0x30 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m1, m7 ; b1 + paddw m5, m3 + pmaddubsw m3, m1, m8 ; a2 + paddw m4, m3 + pmaddubsw m3, m2, m8 ; b2 + paddw m5, m3 + pmulhrsw m4, m9 + pmulhrsw m5, m9 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m5 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + lea r6d, [wq*2-32] + lea srcq, [srcq+nsq*2] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*8] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + vbroadcasti128 m4, [srcq+ssq*1] + mov r7, tmpq + vbroadcasti128 m0, [r5+ssq*0] + vbroadcasti128 m1, [r5+ssq*1] + lea r5, [r5+ssq*2] + vbroadcasti128 m2, [r5+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m5, [r5+ssq*1] + pmaddubsw m10, m1, m6 ; a0 + lea r5, [r5+ssq*2] + pmaddubsw m11, m2, m6 ; b0 + mova m1, m3 + pmaddubsw m3, m7 ; a1 + mova m2, m4 + pmaddubsw m4, m7 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r5+ssq*0] + paddw m11, m4 + shufpd m4, m0, m5, 0x0d + shufpd m0, m5, 
m3, 0x0c + punpcklbw m3, m4, m0 ; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m5, m3, m8 ; a2 + paddw m10, m5 + pmaddubsw m5, m4, m8 ; b2 + paddw m11, m5 + pmulhrsw m10, m9 + pmulhrsw m11, m9 + mova [r7+wq*0], m10 + mova [r7+wq*2], m11 + lea r7, [r7+wq*4] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add tmpq, 32 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + RET .h_w4: + RESET_STACK_STATE movzx mxd, mxb + vbroadcasti128 m3, [subpel_h_shufA] dec srcq - vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] - lea stride3q, [strideq*3] + vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + lea r3, [ssq*3] .h_w4_loop: - movq xm0, [srcq+strideq*0] - vpbroadcastq m2, [srcq+strideq*2] - movq xm1, [srcq+strideq*1] - vpblendd m0, m2, 0xf0 - vpbroadcastq m2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m2, 0xf0 - pshufb m0, m5 - pshufb m1, m5 - pmaddubsw m0, m6 - pmaddubsw m1, m6 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*2] + movq xm1, [srcq+ssq*1] + vpblendd m0, m2, 0x30 + vpbroadcastq m2, [srcq+r3 ] + lea srcq, [srcq+ssq*4] + vpblendd m1, m2, 0x30 + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m5 + pmaddubsw m1, m5 phaddw m0, m1 pmulhrsw m0, m4 mova [tmpq], m0 @@ -2191,25 +2798,56 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 sub hd, 4 jg .h_w4_loop RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je .h_w4 + WIN64_SPILL_XMM 10 + tzcnt wd, wd + vbroadcasti128 m3, [z_filter_s+ 2] + shr mxd, 16 + vbroadcasti128 m5, [z_filter_s+ 6] + sub srcq, 2 + vbroadcasti128 m6, [z_filter_s+10] + lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX] + movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)] + vpbroadcastw m7, [mxq+0] + vpbroadcastw m8, [mxq+2] + add wq, r7 + vpbroadcastw m9, [mxq+4] + jmp wq .h_w8: - movu xm0, [srcq+strideq*0] - vinserti128 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, 
[srcq+ssq*2] +%macro PREP_6TAP_H 0 + pshufb m1, m0, m3 + pmaddubsw m1, m7 + pshufb m2, m0, m5 + pmaddubsw m2, m8 + pshufb m0, m6 + pmaddubsw m0, m9 + paddw m1, m2 + paddw m0, m1 + pmulhrsw m0, m4 +%endmacro + PREP_6TAP_H mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - PREP_8TAP_H + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + PREP_6TAP_H mova [tmpq+32*0], m0 - movu xm0, [srcq+strideq*1+8*0] - vinserti128 m0, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*1+8*0] + vinserti128 m0, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 sub hd, 2 @@ -2229,27 +2867,219 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 add r6, 32 jle .h_loop - add srcq, strideq + add srcq, ssq mov r6, r5 dec hd jg .h_loop RET +.hv: + WIN64_SPILL_XMM 14, 16 + cmp wd, 4 + jne .hv_w8 +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mova m6, [subpel_h_shuf4] + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + mov nsq, ssq + pmovzxbd m13, [deint_shuf4] + neg nsq + vpbroadcastd m8, [pw_8192] + vpbroadcastd m9, [pd_32] + punpcklbw m0, m0 + vpbroadcastq m2, [srcq+nsq*2] + psraw m0, 8 ; sign-extend + vpbroadcastq m4, [srcq+nsq*1] + pshufd m10, m0, q0000 + vpbroadcastq m1, [srcq+ssq*0] + pshufd m11, m0, q1111 + vpbroadcastq m3, [srcq+ssq*1] + pshufd m12, m0, q2222 + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m6 + pshufb m1, m6 + pshufb m0, m6 + pmaddubsw m2, m7 + pmaddubsw 
m1, m7 + pmaddubsw m0, m7 + phaddw m2, m1 ; 0 1 2 3 + phaddw m0, m0 ; 4 + pmulhrsw m2, m8 + pmulhrsw m0, m8 + palignr m0, m2, 4 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + pmaddwd m4, m10, m1 ; a0 b0 + lea srcq, [srcq+ssq*4] + pmaddwd m5, m2, m10 ; c0 d0 + vpbroadcastq m1, [srcq+nsq*1] + pmaddwd m2, m11 ; a1 b1 + vpbroadcastq m3, [srcq+ssq*0] + paddd m4, m2 + vpbroadcastq m2, [srcq+ssq*1] + vpblendd m1, m3, 0xcc ; 5 6 + vpbroadcastq m3, [srcq+ssq*2] + vpblendd m2, m3, 0xcc ; 7 8 + pshufb m1, m6 + pshufb m2, m6 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + phaddw m1, m2 ; 5 6 7 8 + pmulhrsw m1, m8 + paddd m5, m9 + paddd m4, m9 + palignr m2, m1, m0, 12 + mova m0, m1 + punpcklwd m1, m2, m0 ; 45 56 + punpckhwd m2, m0 ; 67 78 + pmaddwd m3, m11, m1 ; c1 d1 + paddd m5, m3 + pmaddwd m3, m12, m1 ; a2 b2 + paddd m4, m3 + pmaddwd m3, m12, m2 ; c2 d2 + paddd m5, m3 + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + vpermd m4, m13, m4 + mova [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2] + lea r7, [ssq*2+2] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + lea r6d, [wq*8-64] + pshufd m13, m0, q0000 + sub srcq, r7 + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + movu xm4, [srcq+ssq*1] + vbroadcasti128 m0, [r5+ssq*0] + mov r7, tmpq + vinserti128 m4, [r5+ssq*1], 1 ; 1 3 + lea r5, [r5+ssq*2] + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r5+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + 
HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti128 m7, [r5+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_32] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermq m5, m5, q3120 + mova [r7+wq*0], xm5 + vextracti128 [r7+wq*2], m5, 1 + lea r7, [r7+wq*4] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add tmpq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc +PREP_8TAP_FN sharp, SHARP, SHARP + +cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep%+SUFFIX] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep .v: - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 12, 15 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 
shr myd, 16 ; Note that the code is 8-tap only, having cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q @@ -2359,72 +3189,154 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w8_loop RET .v_w16: - add wd, wd - mov r5, srcq - mov r7, tmpq - lea r6d, [hq+wq*8-256] + lea r6d, [wq*2-32] + WIN64_PUSH_XMM 15 + lea r6d, [hq+r6*8] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m0, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*0] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+strideq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c + lea r5, [srcq+strideq*2] + vbroadcasti128 m0, [r5+strideq*1] + vbroadcasti128 m6, [r5+strideq*0] + lea r5, [r5+strideq*2] + vbroadcasti128 m1, [r5+strideq*0] + vbroadcasti128 m2, [r5+strideq*1] + lea r5, [r5+strideq*2] + vbroadcasti128 m3, [r5+strideq*0] + mov r7, tmpq + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 - shufpd m6, m6, m2, 0x0c + shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c + shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m13, [srcq+strideq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r5+strideq*1] + lea r5, [r5+strideq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, 
m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r5+strideq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova [tmpq+wq*0], m14 - mova [tmpq+wq*1], m15 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], m13 + mova [r7+wq*2], m14 + lea r7, [r7+wq*4] sub hd, 2 jg .v_w16_loop - add r5, 16 - add r7, 32 + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET +.h: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 + WIN64_SPILL_XMM 10 + vbroadcasti128 m5, [subpel_h_shufA] + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + add wq, r7 + jmp wq +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%macro PREP_8TAP_H 0 + pshufb m1, m0, m5 + pshufb m2, m0, m6 + pshufb m3, m0, m7 + pmaddubsw m1, m8 + pmaddubsw m0, m2, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m0, m3 + phaddw m0, m1, m0 + pmulhrsw m0, m4 +%endmacro + PREP_8TAP_H + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+strideq*1+8*0] + vinserti128 m0, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: 
+ xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + mov r5, r6 +.h_loop: + movu xm0, [srcq+r6+8*0] + vinserti128 m0, [srcq+r6+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+r6+8*2] + vinserti128 m0, [srcq+r6+8*3], 1 + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + add r6, 32 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET .hv: WIN64_SPILL_XMM 16 cmp wd, 4 @@ -2542,28 +3454,27 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET .hv_w8: lea r6d, [wq*8-64] - mov r5, srcq - mov r7, tmpq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+strideq*0] + lea r5, [srcq+strideq*2] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + mov r7, tmpq vbroadcasti128 m9, [subpel_h_shufC] - movu xm6, [srcq+strideq*0] - vbroadcasti128 m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 - vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + movu xm6, [r5+strideq*0] + vbroadcasti128 m0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r5+strideq*0], 1 ; 1 4 + vinserti128 m6, [r5+strideq*1], 1 ; 2 5 + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 3 6 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2580,10 +3491,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 
.hv_w8_loop: - vextracti128 [tmpq], m0, 1 ; not enough registers - movu xm0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 + vextracti128 [r7], m0, 1 ; not enough registers + movu xm0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2601,15 +3512,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] - vbroadcasti128 m6, [tmpq] + vbroadcasti128 m6, [r7] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2620,16 +3531,14 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 - mova [tmpq+wq*0], xm7 - vextracti128 [tmpq+wq*2], m7, 1 - lea tmpq, [tmpq+wq*4] + mova [r7+wq*0], xm7 + vextracti128 [r7+wq*2], m7, 1 + lea r7, [r7+wq*4] sub hd, 2 jg .hv_w8_loop - add r5, 8 - add r7, 16 + add srcq, 8 + add tmpq, 16 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET @@ -4008,14 +4917,14 @@ DECLARE_REG_TMP 6, 8 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, 
put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put @@ -4026,14 +4935,14 @@ DECLARE_REG_TMP 6, 7 %endif BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index f9043f1ad3..50e670ec25 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -89,55 +89,47 @@ wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 
29, 3 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 -bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 - db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 -bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 - db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 -bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 - db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 -bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 - db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 - db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 -bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 - db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 -bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 -spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 +bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 
6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 +bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39 + db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71 +bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71 + db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79 +bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 + db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 +bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11 +spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 -spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 - db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 -spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 - db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 -spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 
+spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 -spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 - db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 -spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 - db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 -spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 +spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39 +spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 @@ -154,34 +146,20 @@ spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 2 db 8, 
9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 -spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 - db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 - db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 - db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 -spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 +spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 -spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 - db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 -spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 + db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 - db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 - db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 -spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 - db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 - db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 - db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 -spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 +spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 
14, 14, 15, 16, 17, 15, 16, 17, 18 -spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 - db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 - db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 - db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 +spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 + db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 @@ -189,15 +167,14 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 1 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 resize_permC: dd 0, 4, 8, 12 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: dq 0xfedcba9876543210 @@ -205,6 +182,8 @@ wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 +pb_4: times 4 db 4 +pb_32: times 4 db 32 pb_127: times 4 db 127 pw_m128 
times 2 dw -128 pw_m256: times 2 dw -256 @@ -216,7 +195,6 @@ pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 -pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) @@ -289,8 +267,10 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 @@ -401,9 +381,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 - vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_perm16] + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd @@ -526,9 +506,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + imul mxyd, 255 vpbroadcastd m5, [pw_2048] - add mxyd, 16 << 8 + add mxyd, 16 add wq, r7 vpbroadcastw m4, mxyd jmp wq @@ -539,7 +519,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 - punpcklbw xmm1, xmm0, xmm1 + punpcklbw xmm1, xmm0 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 @@ -552,11 +532,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w4: movd xmm0, 
[srcq+ssq*0] .v_w4_loop: - vpbroadcastd xmm1, [srcq+ssq*1] + vpbroadcastd xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 + vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm1, xmm0, 0x02 ; 1 2 + vpblendd xmm2, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 @@ -570,11 +550,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: - movq xmm3, [srcq+ssq*1] + movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw xmm1, xmm3, xmm0 + punpcklbw xmm1, xmm0, xmm2 movq xmm0, [srcq+ssq*0] - punpcklbw xmm2, xmm0, xmm3 + punpcklbw xmm2, xmm0 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 @@ -589,11 +569,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: - vbroadcasti128 ymm2, [srcq+ssq*1] + vbroadcasti128 ymm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 + vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] - vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 + vpblendd ymm3, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 @@ -612,11 +592,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: - vbroadcasti32x8 m2, [srcq+ssq*1] + vbroadcasti32x8 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendmd m3{k1}, m2, m0 ; 0 1 + vpblendmd m2{k1}, m3, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] - vpblendmd m2{k1}, m0, m2 ; 1 2 + vpblendmd m3{k1}, m0, m3 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 @@ -635,18 +615,18 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m6, m3, m0 + punpcklbw m1, m0, m3 + punpckhbw m6, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 - punpcklbw m2, m0, m3 - 
punpckhbw m7, m0, m3 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 pmaddubsw m2, m4 - pmaddubsw m7, m4 - REPX {pmulhrsw x, m5}, m1, m6, m2, m7 + pmaddubsw m3, m4 + REPX {pmulhrsw x, m5}, m1, m6, m2, m3 packuswb m1, m6 - packuswb m2, m7 + packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] @@ -660,13 +640,13 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] - punpcklbw m6, m2, m0 + punpcklbw m6, m0, m2 pmaddubsw m6, m4 - punpckhbw m0, m2, m0 + punpckhbw m0, m2 pmaddubsw m0, m4 - punpcklbw m7, m3, m1 + punpcklbw m7, m1, m3 pmaddubsw m7, m4 - punpckhbw m1, m3, m1 + punpckhbw m1, m3 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, m7, m1 packuswb m6, m0 @@ -1005,8 +985,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r6m ; my test mxyd, mxyd @@ -1032,7 +1012,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .h_w4_loop RET .h_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] .h_w8_loop: movu xmm0, [srcq+strideq*0] vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 @@ -1127,8 +1107,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v: WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 add wq, t2 lea stride3q, [strideq*3] vpbroadcastw m6, mxyd @@ -1218,11 +1198,11 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w64_loop: vpermq m1, m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - punpcklbw m4, m1, m0 - punpckhbw m2, m1, m0 + punpcklbw m4, m0, m1 + punpckhbw m2, m0, m1 vpermq m0, m5, [srcq+strideq*0] - punpcklbw m3, m0, m1 - punpckhbw m1, m0, m1 + punpcklbw m3, 
m1, m0 + punpckhbw m1, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m3, m6 @@ -1243,28 +1223,28 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpermq m2, m5, [srcq+strideq*1+ 0] vpermq m3, m5, [srcq+strideq*1+64] lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m0, m2, m0 + punpcklbw m4, m0, m2 + punpckhbw m0, m2 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m0 - punpcklbw m4, m3, m1 - punpckhbw m1, m3, m1 + punpcklbw m4, m1, m3 + punpckhbw m1, m3 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+64*2], m4 mova [tmpq+64*3], m1 vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] - punpcklbw m4, m0, m2 - punpckhbw m2, m0, m2 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 mova [tmpq+64*4], m4 mova [tmpq+64*5], m2 - punpcklbw m4, m1, m3 - punpckhbw m3, m1, m3 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 pmaddubsw m4, m6 pmaddubsw m3, m6 mova [tmpq+64*6], m4 @@ -1308,7 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .hv_w4_loop RET .hv_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] vbroadcasti32x4 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 @@ -1448,7 +1428,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1456,8 +1436,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1489,24 +1469,22 @@ DECLARE_REG_TMP 4, 5 DECLARE_REG_TMP 7, 8 %endif +; Due to the use of vpdpbusd (which does 4 pixels per instruction) in +; the 
horizontal filter, 6-tap is only used for the vertical filter. %define PUT_8TAP_FN FN put_8tap, - -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns %define base r8-put_avx512icl imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx512icl] movsxd wq, wm movifnidn hd, hm @@ -1514,6 +1492,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1523,474 +1502,273 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [base+mxq*8+subpel_filters+0] - vpbroadcastd m10, [base+mxq*8+subpel_filters+4] - add wq, r8 - jmp wq -.h_w2: - movzx mxd, mxb - dec srcq - mova xmm4, [subpel_h_shuf4] 
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w2_loop: - movq xmm0, [srcq+ssq*0] - movhps xmm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xmm0, xmm4 - mova xmm1, xm5 - vpdpbusd xmm1, xmm0, xmm3 - packssdw xmm0, xmm1, xmm1 - psraw xmm0, 6 - packuswb xmm0, xm0 - pextrw [dstq+dsq*0], xmm0, 0 - pextrw [dstq+dsq*1], xmm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w2_loop - RET -.h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w4_loop: - movq xmm0, [srcq+ssq*0] - movq xmm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xmm0, xm6 - pshufb xmm1, xm6 - mova xmm2, xm5 - vpdpbusd xmm2, xmm0, xmm3 - mova xmm0, xm5 - vpdpbusd xmm0, xmm1, xmm3 - packssdw xmm0, xmm2, xmm0 - psraw xmm0, 6 - packuswb xmm0, xmm0 - movd [dstq+dsq*0], xmm0 - pextrd [dstq+dsq*1], xmm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w4_loop - RET -.h_w8: - movu xm0, [srcq+ssq*0] - vinserti32x4 ym0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xm0, ym0 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w8 - RET -.h_w16: - mova m6, [spel_h_perm16a] - mova m7, [spel_h_perm16b] - mova m8, [spel_h_perm16c] -.h_w16_loop: - movu ym0, [srcq+ssq*0] - vinserti32x8 m0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3, 1 - vpmovuswb ym0, m0 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], ym0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w16_loop - RET -.h_w32: - movu ym0, [srcq+ssq*0+8*0] - vinserti32x8 m0, [srcq+ssq*1+8*0], 1 - movu ym1, [srcq+ssq*0+8*1] - vinserti32x8 m1, [srcq+ssq*1+8*1], 1 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq+dsq*0], ym0 - vextracti32x8 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w32 - RET -.h_w64: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] - add srcq, ssq - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq], m0 
- add dstq, dsq - dec hd - jg .h_w64 - RET -.h_w128: - movu m0, [srcq+8*0] - movu m2, [srcq+8*1] - movu m1, [srcq+8*8] - movu m3, [srcq+8*9] - add srcq, ssq - PUT_8TAP_H 0, 4, 11, 12 - PUT_8TAP_H 2, 12, 11, 4 - PUT_8TAP_H 1, 4, 11, 12 - PUT_8TAP_H 3, 12, 11, 4 - packuswb m0, m2 - packuswb m1, m3 - mova [dstq+64*0], m0 - mova [dstq+64*1], m1 - add dstq, dsq - dec hd - jg .h_w128 - RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd - movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] - vpbroadcastd m7, [pw_512] - lea myq, [base+subpel_filters+myq*8] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m6, [pw_512] + lea myq, [base+subpel_filters+1+myq*8] + vpbroadcastw m7, [myq+0] add r6, r8 - lea ss3q, [ssq*3] - sub srcq, ss3q + vpbroadcastw m8, [myq+2] + mov nsq, ssq + vpbroadcastw m9, [myq+4] + neg nsq jmp r6 .v_w2: - movd xmm2, [srcq+ssq*0] - pinsrw xmm2, [srcq+ssq*1], 2 - pinsrw xmm2, [srcq+ssq*2], 4 - add srcq, ss3q - pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 - movd xmm3, [srcq+ssq*1] - vpbroadcastd xmm1, [srcq+ssq*2] - add srcq, ss3q + movd xmm2, [srcq+nsq*2] + pinsrw xmm2, [srcq+nsq*1], 2 + pinsrw xmm2, [srcq+ssq*0], 4 + pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 - vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 - palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 - punpcklbw xmm3, xmm1 ; 45 56 - punpcklbw xmm1, xmm2, xmm4 ; 01 12 - punpckhbw xmm2, xmm4 ; 23 34 + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 .v_w2_loop: - pmaddubsw xmm5, xmm1, xm8 ; a0 b0 - mova xmm1, xmm2 - pmaddubsw xmm2, xm9 ; a1 b1 - paddw xmm5, xmm2 - mova xmm2, xmm3 - pmaddubsw xmm3, xm10 ; a2 b2 - paddw xmm5, xmm3 vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + 
pmaddubsw xmm3, xmm1, xm7 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 - punpcklbw xmm3, xmm4 ; 67 78 - pmaddubsw xmm4, xmm3, xm11 ; a3 b3 - paddw xmm5, xmm4 - pmulhrsw xmm5, xm7 - packuswb xmm5, xmm5 - pextrw [dstq+dsq*0], xmm5, 0 - pextrw [dstq+dsq*1], xmm5, 2 + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 45 56 + pmaddubsw xmm4, xmm2, xm9 ; a2 b2 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + pextrw [dstq+dsq*0], xmm3, 0 + pextrw [dstq+dsq*1], xmm3, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: - movd xmm2, [srcq+ssq*0] - pinsrd xmm2, [srcq+ssq*1], 1 - pinsrd xmm2, [srcq+ssq*2], 2 - add srcq, ss3q - pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 - movd xmm3, [srcq+ssq*1] - vpbroadcastd xmm1, [srcq+ssq*2] - add srcq, ss3q + movd xmm2, [srcq+nsq*2] + pinsrd xmm2, [srcq+nsq*1], 1 + pinsrd xmm2, [srcq+ssq*0], 2 + pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 - vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 - palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 - punpcklbw xmm3, xmm1 ; 45 56 - punpcklbw xmm1, xmm2, xmm4 ; 01 12 - punpckhbw xmm2, xmm4 ; 23 34 + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 .v_w4_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + pmaddubsw xmm3, xmm1, xm7 ; a0 b0 mova xmm1, xmm2 - pmaddubsw xmm2, xm9 ; a1 b1 - paddw xmm5, xmm2 - mova xmm2, xmm3 - pmaddubsw xmm3, xm10 ; a2 b2 - paddw xmm5, xmm3 - vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 - punpcklbw xmm3, xmm4 ; 67 78 - pmaddubsw xmm4, xmm3, xm11 ; a3 b3 - paddw xmm5, xmm4 - pmulhrsw
xmm5, xm7 - packuswb xmm5, xmm5 - movd [dstq+dsq*0], xmm5 - pextrd [dstq+dsq*1], xmm5, 1 + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 45 56 + pmaddubsw xmm4, xmm2, xm9 ; a2 b2 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + movd [dstq+dsq*0], xmm3 + pextrd [dstq+dsq*1], xmm3, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: - movq xmm1, [srcq+ssq*0] - vpbroadcastq ymm0, [srcq+ssq*1] - vpbroadcastq ymm2, [srcq+ssq*2] - add srcq, ss3q - vpbroadcastq ymm5, [srcq+ssq*0] - vpbroadcastq ymm3, [srcq+ssq*1] - vpbroadcastq ymm4, [srcq+ssq*2] - add srcq, ss3q - vpblendd ymm1, ymm0, 0x30 - vpblendd ymm0, ymm2, 0x30 - punpcklbw ymm1, ymm0 ; 01 12 + movq xmm1, [srcq+nsq*2] + vpbroadcastq ymm3, [srcq+nsq*1] + vpbroadcastq ymm2, [srcq+ssq*0] + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vpbroadcastq ymm0, [srcq+ssq*0] - vpblendd ymm2, ymm5, 0x30 - vpblendd ymm5, ymm3, 0x30 - punpcklbw ymm2, ymm5 ; 23 34 - vpblendd ymm3, ymm4, 0x30 + vpblendd ymm1, ymm3, 0x30 + vpblendd ymm3, ymm2, 0x30 + punpcklbw ymm1, ymm3 ; 01 12 + vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 - punpcklbw ymm3, ymm4 ; 45 56 + punpcklbw ymm2, ymm4 ; 23 34 .v_w8_loop: vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + pmaddubsw ymm3, ymm1, ym7 ; a0 b0 mova ymm1, ymm2 - pmaddubsw ymm2, ym9 ; a1 b1 - paddw ymm5, ymm2 - mova ymm2, ymm3 - pmaddubsw ymm3, ym10 ; a2 b2 - paddw ymm5, ymm3 - vpblendd ymm3, ymm0, ymm4, 0x30 + pmaddubsw ymm2, ym8 ; a1 b1 + paddw ymm3, ymm2 + vpblendd ymm2, ymm0, ymm4, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] - vpblendd ymm4, ymm4, ymm0, 0x30 - punpcklbw ymm3, ymm4 ; 67 78 - pmaddubsw ymm4, ymm3, ym11 ; a3 b3 - paddw ymm5, ymm4 - pmulhrsw ymm5, ym7 - vextracti128 xmm4, ymm5, 1 - packuswb xmm5, xmm4 - movq [dstq+dsq*0], xmm5 - movhps [dstq+dsq*1], xmm5 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm2, ymm4 ; 45 56 + pmaddubsw ymm4, ymm2, ym9 ; a2 b2 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym6 + 
vextracti128 xmm4, ymm3, 1 + packuswb xmm3, xmm4 + movq [dstq+dsq*0], xmm3 + movhps [dstq+dsq*1], xmm3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: - mova m12, [spel_v_perm16] - vbroadcasti32x4 m1, [srcq+ssq*0] - vbroadcasti32x4 ym4, [srcq+ssq*1] + mova m5, [spel_v_perm16a] + vbroadcasti32x4 m1, [srcq+nsq*2] + vbroadcasti32x4 ym3, [srcq+nsq*1] mov r6d, 0x0f - vbroadcasti32x4 m2, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti32x4 ym5, [srcq+ssq*0] + vbroadcasti32x4 m2, [srcq+ssq*0] kmovb k1, r6d - vbroadcasti32x4 m3, [srcq+ssq*1] - vbroadcasti32x4 ym6, [srcq+ssq*2] - add srcq, ss3q + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vbroadcasti32x4 m0, [srcq+ssq*0] - vshufpd m1{k1}, m4, m2, 0xcc - vshufpd m2{k1}, m5, m3, 0xcc - vshufpd m3{k1}, m6, m0, 0xcc - vpermb m1, m12, m1 ; 01 12 - vpermb m2, m12, m2 ; 23 34 - vpermb m3, m12, m3 ; 45 56 + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m5, m1 ; 01 12 + vpermb m2, m5, m2 ; 23 34 .v_w16_loop: - pmaddubsw m4, m1, m8 ; a0 b0 - mova m1, m2 - pmaddubsw m5, m2, m9 ; a1 b1 - mova m2, m3 - pmaddubsw m6, m3, m10 ; a2 b2 - mova m3, m0 - paddw m4, m5 - vbroadcasti32x4 ym5, [srcq+ssq*1] + vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m7 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m8 ; a1 b1 + paddw m3, m2 + mova m2, m0 vbroadcasti32x4 m0, [srcq+ssq*0] - vshufpd m3{k1}, m5, m0, 0xcc - vpermb m3, m12, m3 ; 67 78 - pmaddubsw m5, m3, m11 ; a3 b3 - paddw m4, m6 - paddw m4, m5 - pmulhrsw m4, m7 - vextracti32x8 ym5, m4, 1 - packuswb ym4, ym5 - mova [dstq+dsq*0], xm4 - vextracti32x4 [dstq+dsq*1], ym4, 1 + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m2, m5, m2 ; 45 56 + pmaddubsw m4, m2, m9 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m6 + vextracti32x8 ym4, m3, 1 + packuswb ym3, ym4 + mova [dstq+dsq*0], xm3 + vextracti32x4 [dstq+dsq*1], ym3, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: - mova m12, [spel_v_perm32] - pmovzxbq m14, [pb_02461357] - 
vpshrdw m13, m12, m12, 8 - movu ym0, [srcq+ssq*0] + mova m10, [spel_v_perm32] + pmovzxbq m5, [pb_02461357] + vpshrdw m11, m10, m10, 8 + movu ym0, [srcq+nsq*2] + vinserti32x8 m0, [srcq+nsq*1], 1 + vpermb m1, m10, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m2, m11, m0 ; 12 vinserti32x8 m0, [srcq+ssq*1], 1 - vpermb m1, m12, m0 ; 01 - vinserti32x8 m0, [srcq+ssq*2], 0 - add srcq, ss3q - vpermb m2, m13, m0 ; 12 - vinserti32x8 m0, [srcq+ssq*0], 1 - vpermb m3, m12, m0 ; 23 - vinserti32x8 m0, [srcq+ssq*1], 0 - vpermb m4, m13, m0 ; 34 - vinserti32x8 m0, [srcq+ssq*2], 1 - add srcq, ss3q - vpermb m5, m12, m0 ; 45 + lea srcq, [srcq+ssq*2] + vpermb m3, m10, m0 ; 23 vinserti32x8 m0, [srcq+ssq*0], 0 - vpermb m6, m13, m0 ; 56 + vpermb m4, m11, m0 ; 34 .v_w32_loop: vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - pmaddubsw m15, m1, m8 + pmaddubsw m12, m1, m7 mova m1, m3 - pmaddubsw m16, m2, m8 + pmaddubsw m13, m2, m7 mova m2, m4 - pmaddubsw m17, m3, m9 - mova m3, m5 - pmaddubsw m18, m4, m9 - mova m4, m6 - pmaddubsw m19, m5, m10 - vpermb m5, m12, m0 ; 67 + pmaddubsw m14, m3, m8 + vpermb m3, m10, m0 ; 45 vinserti32x8 m0, [srcq+ssq*0], 0 - pmaddubsw m20, m6, m10 - vpermb m6, m13, m0 ; 78 - paddw m15, m17 - pmaddubsw m17, m5, m11 - paddw m16, m18 - pmaddubsw m18, m6, m11 - paddw m15, m19 - paddw m16, m20 - paddw m15, m17 - paddw m16, m18 - pmulhrsw m15, m7 - pmulhrsw m16, m7 - packuswb m15, m16 - vpermq m15, m14, m15 - mova [dstq+dsq*0], ym15 - vextracti32x8 [dstq+dsq*1], m15, 1 + pmaddubsw m15, m4, m8 + vpermb m4, m11, m0 ; 56 + paddw m12, m14 + pmaddubsw m14, m3, m9 + paddw m13, m15 + pmaddubsw m15, m4, m9 + paddw m12, m14 + paddw m13, m15 + pmulhrsw m12, m6 + pmulhrsw m13, m6 + packuswb m12, m13 + vpermq m12, m5, m12 + mova [dstq+dsq*0], ym12 + vextracti32x8 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop - vzeroupper RET .v_w64: .v_w128: lea r6d, [hq+wq*4-256] - mov r4, srcq - mov r7, dstq .v_loop0: - movu m2, [srcq+ssq*0] - movu m4, 
[srcq+ssq*1] - movu m6, [srcq+ssq*2] - add srcq, ss3q - movu m13, [srcq+ssq*0] - movu m15, [srcq+ssq*1] - movu m17, [srcq+ssq*2] - add srcq, ss3q - movu m0, [srcq+ssq*0] - punpcklbw m1, m2, m4 ; 01l - punpckhbw m2, m4 ; 01h - punpcklbw m3, m4, m6 ; 12l - punpckhbw m4, m6 ; 12h - punpcklbw m5, m6, m13 ; 23l - punpckhbw m6, m13 ; 23h - punpcklbw m12, m13, m15 ; 34l - punpckhbw m13, m15 ; 34h - punpcklbw m14, m15, m17 ; 45l - punpckhbw m15, m17 ; 45h - punpcklbw m16, m17, m0 ; 56l - punpckhbw m17, m0 ; 56h + movu m2, [srcq+nsq*2] + movu m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + movu m11, [srcq+ssq*0] + movu m13, [srcq+ssq*1] + mov r7, dstq + movu m0, [r4 +ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m11 ; 12l + punpckhbw m4, m11 ; 12h + punpcklbw m10, m11, m13 ; 23l + punpckhbw m11, m13 ; 23h + punpcklbw m12, m13, m0 ; 34l + punpckhbw m13, m0 ; 34h .v_loop: - pmaddubsw m18, m1, m8 ; a0l - mova m1, m5 - pmaddubsw m19, m2, m8 ; a0h - mova m2, m6 - pmaddubsw m20, m3, m8 ; b0l + movu m5, [r4+ssq*1] + pmaddubsw m14, m1, m7 ; a0l + mova m1, m10 + pmaddubsw m10, m8 ; a1l + lea r4, [r4+ssq*2] + pmaddubsw m15, m2, m7 ; a0h + mova m2, m11 + pmaddubsw m11, m8 ; a1h + paddw m14, m10 + punpcklbw m10, m0, m5 ; 45l + paddw m15, m11 + punpckhbw m11, m0, m5 ; 45h + pmaddubsw m0, m10, m9 ; a2l + paddw m14, m0 + pmaddubsw m0, m11, m9 ; a2h + paddw m15, m0 + movu m0, [r4+ssq*0] + pmulhrsw m14, m6 + pmulhrsw m15, m6 + packuswb m14, m15 + pmaddubsw m15, m3, m7 ; b0l mova m3, m12 - pmaddubsw m21, m4, m8 ; b0h + pmaddubsw m12, m8 ; b1l + mova [r7+dsq*0], m14 + pmaddubsw m14, m4, m7 ; b0h mova m4, m13 - pmaddubsw m5, m9 ; a1l - pmaddubsw m6, m9 ; a1h - pmaddubsw m12, m9 ; b1l - pmaddubsw m13, m9 ; b1h - paddw m18, m5 - mova m5, m14 - pmaddubsw m14, m10 ; a2l - paddw m19, m6 - mova m6, m15 - pmaddubsw m15, m10 ; a2h - paddw m20, m12 - mova m12, m16 - pmaddubsw m16, m10 ; b2l - paddw m21, m13 - mova m13, m17 - pmaddubsw m17, m10 ; b2h - paddw m18, m14 - paddw 
m19, m15 - paddw m20, m16 - paddw m21, m17 - movu m17, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklbw m14, m0, m17 ; 67l - punpckhbw m15, m0, m17 ; 67h - pmaddubsw m16, m14, m11 ; a3l - pmaddubsw m0, m15, m11 ; a3h - paddw m18, m16 - paddw m19, m0 - movu m0, [srcq+ssq*0] - punpcklbw m16, m17, m0 ; 78l - punpckhbw m17, m0 ; 78h - pmulhrsw m18, m7 - pmulhrsw m19, m7 - packuswb m18, m19 - mova [dstq+dsq*0], m18 - pmaddubsw m18, m16, m11 ; b3l - pmaddubsw m19, m17, m11 ; b3h - paddw m18, m20 - paddw m19, m21 - pmulhrsw m18, m7 - pmulhrsw m19, m7 - packuswb m18, m19 - mova [dstq+dsq*1], m18 - lea dstq, [dstq+dsq*2] + pmaddubsw m13, m8 ; b1h + paddw m15, m12 + punpcklbw m12, m5, m0 ; 56l + paddw m14, m13 + punpckhbw m13, m5, m0 ; 56h + pmaddubsw m5, m12, m9 ; b2l + paddw m15, m5 + pmaddubsw m5, m13, m9 ; b2h + paddw m14, m5 + pmulhrsw m15, m6 + pmulhrsw m14, m6 + packuswb m15, m14 + mova [r7+dsq*1], m15 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_loop - add r4, 64 - add r7, 64 + add srcq, 64 + add dstq, 64 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 256 jg .v_loop0 - vzeroupper RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2 .hv: + vpbroadcastd m9, [pd_34] + mova xm10, [spel_hv_end] + pxor xm0, xm0 cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -2000,94 +1778,850 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq ym0, [base+subpel_filters+myq*8] - lea ss3q, [ssq*3] - vpbroadcastd ym9, [pd_32768] - mov r6, srcq - punpcklbw ym0, ym8, ym0 - sub r6, ss3q + vpbroadcastq ym1, [base+subpel_filters+1+myq*8] + mov nsq, ssq + punpcklbw ym0, ym1 + neg nsq psraw ym0, 2 ; << 6 - mova xm14, [spel_hv_end] - pshufd ym10, ym0, q0000 - pshufd ym11, ym0, q1111 - pshufd ym12, ym0, q2222 - pshufd ym13, ym0, q3333 + pshufd ym11, ym0, q0000 + pshufd ym12, ym0, q1111 + pshufd ym13, ym0, q2222 cmp wd, 4 je .hv_w4 - vbroadcasti128 ym6, [subpel_h_shuf4] - movq xmm2, 
[r6+ssq*0] - movhps xmm2, [r6+ssq*1] - movq xmm0, [r6+ssq*2] - movhps xmm0, [srcq+ssq*0] - vpbroadcastq ymm3, [srcq+ssq*1] - vpbroadcastq ymm4, [srcq+ssq*2] - add srcq, ss3q + vbroadcasti128 ym5, [subpel_h_shuf4] + movq xmm0, [srcq+nsq*2] + movhps xmm0, [srcq+nsq*1] + movq xmm2, [srcq+ssq*0] + movhps xmm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vpbroadcastq ymm1, [srcq+ssq*0] - vpblendd ymm2, ymm3, 0x30 - vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ - vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 - pshufb ymm2, ym6 - pshufb ymm0, ym6 - mova ymm1, ym8 - vpdpbusd ymm1, ymm2, ym7 - mova ymm2, ym8 + vpblendd ymm0, ymm1, 0x30 + pshufb xmm2, xm5 ; 2 3 + pshufb ymm0, ym5 ; 0 1 4 + mova xmm1, xm9 + vpdpbusd xmm1, xmm2, xm7 + mova ymm2, ym9 vpdpbusd ymm2, ymm0, ym7 - packssdw ymm2, ymm1, ymm2 + packssdw ymm2, ymm1 psraw ymm2, 2 - vextracti128 xmm3, ymm2, 1 - palignr xmm4, xmm3, xmm2, 4 - punpcklwd xmm1, xmm2, xmm4 ; 01 12 - punpckhwd xmm2, xmm4 ; 23 34 - pshufd xmm0, xmm3, q2121 - punpcklwd xmm3, xmm0 ; 45 56 + vextracti128 xmm0, ymm2, 1 + vzeroupper + palignr xmm0, xmm2, 4 + punpcklwd xmm1, xmm2, xmm0 ; 01 12 + punpckhwd xmm2, xmm0 ; 23 34 .hv_w2_loop: - movq xmm4, [srcq+ssq*1] + movq xmm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movhps xmm4, [srcq+ssq*0] - mova xmm5, xm9 - vpdpwssd xmm5, xmm1, xm10 ; a0 b0 + movhps xmm3, [srcq+ssq*0] + pmaddwd xmm4, xmm1, xm11 ; a0 b0 mova xmm1, xmm2 - vpdpwssd xmm5, xmm2, xm11 ; a1 b1 - pshufb xmm4, xm6 - mova xmm2, xmm3 - vpdpwssd xmm5, xmm3, xm12 ; a2 b2 - mova xmm3, xm8 - vpdpbusd xmm3, xmm4, xm7 - packssdw xmm4, xmm3, xmm3 - psraw xmm4, 2 - palignr xmm3, xmm4, xmm0, 12 - mova xmm0, xmm4 - punpcklwd xmm3, xmm4 ; 67 78 - vpdpwssd xmm5, xmm3, xm13 ; a3 b3 - packuswb xmm5, xmm5 - pshufb xmm5, xm14 - pextrw [dstq+dsq*0], xmm5, 0 - pextrw [dstq+dsq*1], xmm5, 1 + vpdpwssd xmm4, xmm2, xm12 ; a1 b1 + pshufb xmm3, xm5 + mova xmm2, xm9 + vpdpbusd xmm2, xmm3, xm7 + packssdw xmm3, xmm2, xmm2 + psraw xmm3, 2 + palignr xmm2, xmm3, xmm0, 12 + mova xmm0, xmm3 + punpcklwd 
xmm2, xmm3 ; 45 56 + vpdpwssd xmm4, xmm2, xm13 ; a2 b2 + packuswb xmm4, xmm4 + pshufb xmm4, xm10 + pextrw [dstq+dsq*0], xmm4, 0 + pextrw [dstq+dsq*1], xmm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop - vzeroupper RET .hv_w4: - movq xmm1, [r6+ssq*0] - vpbroadcastq ym2, [r6+ssq*1] - vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 - vinserti32x4 m2, [srcq+ssq*0], 2 - vinserti32x4 m1, [srcq+ssq*1], 2 - vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 - vbroadcasti32x4 m6, [subpel_h_shufA] - add srcq, ss3q - vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 - pshufb m2, m6 - pshufb m1, m6 - mova m0, m8 - vpdpbusd m0, m2, m7 - mova m4, m8 - vpdpbusd m4, m1, m7 + movq xm2, [srcq+nsq*2] + vpbroadcastq ym1, [srcq+nsq*1] + vinserti32x4 ym2, [srcq+ssq*0], 1 + vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m5, [subpel_h_shufA] + vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4 + pshufb m1, m5 + mova m0, m9 + pshufb m2, m5 + mova m3, m9 + vpdpbusd m0, m1, m7 mova ym1, [spel_hv_perm4a] + vpdpbusd m3, m2, m7 mova ym2, [spel_hv_perm4b] - mova ym3, [spel_hv_perm4c] + mov r6d, 0x5555 + mova ym6, [spel_hv_perm4d] + packssdw m0, m3 + kmovw k1, r6d + psraw m0, 2 ; _ 0 1 2 3 4 5 6 + vpermb ym1, ym1, ym0 ; 01 12 + vpermb m2, m2, m0 ; 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym3, [srcq+ssq*0], 1 + pmaddwd ym4, ym1, ym11 ; a0 b0 + mova ym1, ym2 + pshufb ym3, ym5 + mova ym0, ym9 + vpdpbusd ym0, ym3, ym7 + vpdpwssd ym4, ym2, ym12 ; a1 b1 + vpsraw ym2{k1}, ym0, 2 ; 5 6 + vpermb ym2, ym6, ym2 ; 45 56 + vpdpwssd ym4, ym2, ym13 ; a2 b2 + packuswb ym4, ym4 + vpermb ym4, ym10, ym4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m11, [base+subpel_filters+mxq*8+0] + vpbroadcastd m12, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov 
nsq, ssq + punpcklbw m0, m1 + neg nsq + psraw m0, 2 ; << 6 + pshufd m13, m0, q0000 + pshufd m14, m0, q1111 + pshufd m15, m0, q2222 + cmp wd, 8 + jne .hv_w16 + movu xm0, [srcq+nsq*2] + vinserti32x4 ym0, [srcq+nsq*1], 1 + vbroadcasti32x4 m1, [subpel_h_shufA] + vinserti32x4 m0, [srcq+ssq*0], 2 + vbroadcasti32x4 m4, [subpel_h_shufB] + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m7, [subpel_h_shufC] + vbroadcasti32x4 ym5, [srcq+ssq*0] + vbroadcasti32x8 m6, [subpel_h_shufA] + pshufb m1, m0, m1 ; 0 1 2 3 0123 + mova m2, m9 + vpdpbusd m2, m1, m11 + pshufb m4, m0, m4 ; 0 1 2 3 4567 + mova m1, m9 + vpdpbusd m1, m4, m11 + pshufb m0, m7 ; 0 1 2 3 89ab + pshufb ym7, ym5, ym6 ; 4 0123 4567 + mova ym3, ym9 + vpdpbusd ym3, ym7, ym11 + vbroadcasti32x8 m7, [subpel_h_shufB] + vpdpbusd m2, m4, m12 + mova m4, [spel_hv_perm8a] + pshufb ym5, ym7 ; 4 4567 89ab + vpdpbusd m1, m0, m12 + vpaddd m0, m4, [pb_32] {1to16} + vpdpbusd ym3, ym5, ym12 + mova m5, [spel_hv_perm8b] + mov r6, 0x55555555ff00 + packssdw m2, m1 + vpmovsdw xm3, ym3 + kmovq k1, r6 + psraw m2, 2 ; 0 1 2 3 + psraw xm3, 2 ; 4 + vpermb m1, m4, m2 ; 01 12 + kshiftrq k2, k1, 16 + vpermt2b m2, m0, m3 ; 23 34 +.hv_w8_loop: + vbroadcasti32x4 ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m3{k1}, [srcq+ssq*0] + pmaddwd m0, m1, m13 ; a0 b0 + pshufb m1, m3, m6 ; 5 6 0123 4567 + mova m4, m9 + vpdpbusd m4, m1, m11 + pshufb m3, m7 ; 5 6 4567 89ab + vpdpwssd m0, m2, m14 ; a1 b1 + mova m1, m2 + vpdpbusd m4, m3, m12 + psraw m2{k2}, m4, 2 ; 53 64 + vpermb m2, m5, m2 ; 45 56 + vpdpwssd m0, m2, m15 ; a2 b2 + packuswb m0, m0 + vpermb m0, m10, m0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m19, [spel_hv_perm16a] + vpbroadcastd m7, [pb_4] + lea r6d, [wq*2-32] + mova m6, [spel_hv_perm16b] + paddb m20, m7, m19 + lea r6d, [hq+r6*8] + paddb m21, m7, m20 + mova ym10, [spel_hv_end16] + paddb m7, m6 +.hv_w16_loop0: + movu 
ym16, [srcq+nsq*2] + vinserti32x8 m16, [srcq+nsq*1], 1 + lea r4, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 + mov r7, dstq + movu ym18, [r4 +ssq*0] + vpermb m2, m19, m16 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m16 ; 0 1 89ab ghij + vpdpbusd m1, m2, m11 + mova m2, m9 + vpermb m4, m19, m17 ; 2 3 0123 89ab + vpdpbusd m2, m3, m12 + mova m3, m9 + vpermb m5, m21, m17 ; 2 3 89ab ghij + vpdpbusd m3, m4, m11 + mova m4, m9 + vpermb m0, m6, m18 ; 4 0145 2367 89cd abef + vpdpbusd m4, m5, m12 + mova m5, m9 + vpermb m16, m20, m16 ; 0 1 4567 cdef + vpdpbusd m5, m0, m11 + vpermb m17, m20, m17 ; 2 3 4567 cdef + vpdpbusd m1, m16, m12 + vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij + vpdpbusd m2, m16, m11 + vpdpbusd m3, m17, m12 + vpdpbusd m4, m17, m11 + vpdpbusd m5, m18, m12 + packssdw m1, m2 ; 01 + packssdw m3, m4 ; 23 + REPX {psraw x, 2}, m1, m3, m5 + vpshrdd m2, m1, m3, 16 ; 12 + vpshrdd m4, m3, m5, 16 ; 34 +.hv_w16_loop: + movu ym18, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m18, [r4+ssq*0], 1 + pmaddwd m16, m1, m13 ; a0 + vpermb m1, m19, m18 ; 5 6 0123 89ab + pmaddwd m17, m2, m13 ; b0 + vpermb m2, m20, m18 ; 5 6 4567 cdef + mova m0, m9 + vpdpbusd m0, m1, m11 + vpermb m18, m21, m18 + mova m1, m9 + vpdpbusd m1, m2, m11 + vpdpwssd m16, m3, m14 ; a1 + vpdpwssd m17, m4, m14 ; b1 + vpdpbusd m0, m2, m12 + mova m2, m4 + vpdpbusd m1, m18, m12 + packssdw m0, m1 + mova m1, m3 + psraw m4, m0, 2 ; 5 6 + vpshrdd m3, m2, m4, 16 ; 4 5 + vpdpwssd m17, m4, m15 ; b2 + vpdpwssd m16, m3, m15 ; a2 + packuswb m16, m17 + vpermb m16, m10, m16 + mova [r7+dsq*0], xm16 + vextracti128 [r7+dsq*1], ym16, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, 
ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + lea myq, [base+subpel_filters+myq*8] + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + vpbroadcastw m8, [myq+0] + add r6, r8 + vpbroadcastw m9, [myq+2] + lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, ss3q + vpbroadcastw m11, [myq+6] + jmp r6 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrw xmm2, [srcq+ssq*1], 2 + pinsrw xmm2, [srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w2_loop: + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + 
vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm5, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklbw ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm5, 0x30 + vpblendd ymm5, ymm3, 0x30 + punpcklbw ymm2, ymm5 ; 23 34 + vpblendd ymm3, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 45 56 +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm2, ym9 ; a1 b1 + paddw ymm5, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym10 ; a2 b2 + paddw ymm5, ymm3 + vpblendd ymm3, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 67 78 + pmaddubsw ymm4, ymm3, ym11 ; a3 b3 + paddw ymm5, ymm4 + pmulhrsw ymm5, ym7 + vextracti128 xmm4, ymm5, 1 + packuswb xmm5, xmm4 + movq [dstq+dsq*0], xmm5 + movhps [dstq+dsq*1], xmm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + 
vzeroupper + RET +.v_w16: + mova m12, [spel_v_perm16a] + vbroadcasti32x4 m1, [srcq+ssq*0] + vbroadcasti32x4 ym4, [srcq+ssq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 ym5, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 m3, [srcq+ssq*1] + vbroadcasti32x4 ym6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 + mova m2, m3 + pmaddubsw m6, m3, m10 ; a2 b2 + mova m3, m0 + paddw m4, m5 + vbroadcasti32x4 ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m3{k1}, m5, m0, 0xcc + vpermb m3, m12, m3 ; 67 78 + pmaddubsw m5, m3, m11 ; a3 b3 + paddw m4, m6 + paddw m4, m5 + pmulhrsw m4, m7 + vextracti32x8 ym5, m4, 1 + packuswb ym4, ym5 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + mova m12, [spel_v_perm32] + pmovzxbq m14, [pb_02461357] + vpshrdw m13, m12, m12, 8 + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + vpermb m1, m12, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*2], 0 + add srcq, ss3q + vpermb m2, m13, m0 ; 12 + vinserti32x8 m0, [srcq+ssq*0], 1 + vpermb m3, m12, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*1], 0 + vpermb m4, m13, m0 ; 34 + vinserti32x8 m0, [srcq+ssq*2], 1 + add srcq, ss3q + vpermb m5, m12, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m6, m13, m0 ; 56 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m16, m2, m8 + mova m2, m4 + pmaddubsw m17, m3, m9 + mova m3, m5 + pmaddubsw m18, m4, m9 + mova m4, m6 + pmaddubsw m19, m5, m10 + vpermb m5, m12, m0 ; 67 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m20, m6, m10 + vpermb m6, m13, m0 ; 
78 + paddw m15, m17 + pmaddubsw m17, m5, m11 + paddw m16, m18 + pmaddubsw m18, m6, m11 + paddw m15, m19 + paddw m16, m20 + paddw m15, m17 + paddw m16, m18 + pmulhrsw m15, m7 + pmulhrsw m16, m7 + packuswb m15, m16 + vpermq m15, m14, m15 + mova [dstq+dsq*0], ym15 + vextracti32x8 [dstq+dsq*1], m15, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + lea r6d, [hq+wq*4-256] + mov r4, srcq + mov r7, dstq +.v_loop0: + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + movu m17, [srcq+ssq*2] + add srcq, ss3q + movu m0, [srcq+ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m6 ; 12l + punpckhbw m4, m6 ; 12h + punpcklbw m5, m6, m13 ; 23l + punpckhbw m6, m13 ; 23h + punpcklbw m12, m13, m15 ; 34l + punpckhbw m13, m15 ; 34h + punpcklbw m14, m15, m17 ; 45l + punpckhbw m15, m17 ; 45h + punpcklbw m16, m17, m0 ; 56l + punpckhbw m17, m0 ; 56h +.v_loop: + pmaddubsw m18, m1, m8 ; a0l + mova m1, m5 + pmaddubsw m19, m2, m8 ; a0h + mova m2, m6 + pmaddubsw m20, m3, m8 ; b0l + mova m3, m12 + pmaddubsw m21, m4, m8 ; b0h + mova m4, m13 + pmaddubsw m5, m9 ; a1l + pmaddubsw m6, m9 ; a1h + pmaddubsw m12, m9 ; b1l + pmaddubsw m13, m9 ; b1h + paddw m18, m5 + mova m5, m14 + pmaddubsw m14, m10 ; a2l + paddw m19, m6 + mova m6, m15 + pmaddubsw m15, m10 ; a2h + paddw m20, m12 + mova m12, m16 + pmaddubsw m16, m10 ; b2l + paddw m21, m13 + mova m13, m17 + pmaddubsw m17, m10 ; b2h + paddw m18, m14 + paddw m19, m15 + paddw m20, m16 + paddw m21, m17 + movu m17, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m14, m0, m17 ; 67l + punpckhbw m15, m0, m17 ; 67h + pmaddubsw m16, m14, m11 ; a3l + pmaddubsw m0, m15, m11 ; a3h + paddw m18, m16 + paddw m19, m0 + movu m0, [srcq+ssq*0] + punpcklbw m16, m17, m0 ; 78l + punpckhbw m17, m0 ; 78h + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*0], m18 + pmaddubsw m18, m16, m11 ; b3l 
+ pmaddubsw m19, m17, m11 ; b3h + paddw m18, m20 + paddw m19, m21 + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*1], m18 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 256 + jg .v_loop0 + vzeroupper + RET +.h: + test myd, 0xf00 + jnz .hv +.h2: + vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [base+mxq*8+subpel_filters+0] + vpbroadcastd m10, [base+mxq*8+subpel_filters+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xmm4, [subpel_h_shuf4] + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w2_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + mova xmm1, xm5 + vpdpbusd xmm1, xmm0, xmm3 + packssdw xmm0, xmm1, xmm1 + psraw xmm0, 6 + packuswb xmm0, xm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm6 + pshufb xmm1, xm6 + mova xmm2, xm5 + vpdpbusd xmm2, xmm0, xmm3 + mova xmm0, xm5 + vpdpbusd xmm0, xmm1, xmm3 + packssdw xmm0, xmm2, xmm0 + psraw xmm0, 6 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m6, 
[spel_h_perm16] + vpbroadcastd m8, [pb_4] + paddb m7, m8, m6 + paddb m8, m7 +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m3, [srcq+8*9] + add srcq, ssq + PUT_8TAP_H 0, 4, 11, 12 + PUT_8TAP_H 2, 12, 11, 4 + PUT_8TAP_H 1, 4, 11, 12 + PUT_8TAP_H 3, 12, 11, 4 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.hv: + vpbroadcastd m9, [pd_34] + pxor xm0, xm0 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq ym1, [base+subpel_filters+myq*8] + lea ss3q, [ssq*3] + mov r6, srcq + punpcklbw ym0, ym1 + sub r6, ss3q + psraw ym0, 2 ; << 6 + mova xm14, [spel_hv_end] + pshufd ym10, ym0, q0000 + pshufd ym11, ym0, q1111 + pshufd ym12, ym0, q2222 + pshufd ym13, ym0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 ym6, [subpel_h_shuf4] + movq xmm2, [r6+ssq*0] + movhps xmm2, [r6+ssq*1] + movq xmm0, [r6+ssq*2] + movhps xmm0, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm1, 
[srcq+ssq*0] + vpblendd ymm2, ymm3, 0x30 + vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ + vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 + pshufb ymm2, ym6 + pshufb ymm0, ym6 + mova ymm1, ym9 + vpdpbusd ymm1, ymm2, ym7 + mova ymm2, ym9 + vpdpbusd ymm2, ymm0, ym7 + packssdw ymm2, ymm1, ymm2 + psraw ymm2, 2 + vextracti128 xmm3, ymm2, 1 + palignr xmm4, xmm3, xmm2, 4 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 + pshufd xmm0, xmm3, q2121 + punpcklwd xmm3, xmm0 ; 45 56 +.hv_w2_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm4, [srcq+ssq*0] + pmaddwd xmm5, xmm1, xm10 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xmm2, xm11 ; a1 b1 + pshufb xmm4, xm6 + mova xmm2, xmm3 + vpdpwssd xmm5, xmm3, xm12 ; a2 b2 + mova xmm3, xm9 + vpdpbusd xmm3, xmm4, xm7 + packssdw xmm4, xmm3, xmm3 + psraw xmm4, 2 + palignr xmm3, xmm4, xmm0, 12 + mova xmm0, xmm4 + punpcklwd xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xmm3, xm13 ; a3 b3 + packuswb xmm5, xmm5 + pshufb xmm5, xm14 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + vzeroupper + RET +.hv_w4: + movq xmm1, [r6+ssq*0] + vpbroadcastq ym2, [r6+ssq*1] + vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 + vinserti32x4 m2, [srcq+ssq*0], 2 + vinserti32x4 m1, [srcq+ssq*1], 2 + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 + vbroadcasti32x4 m6, [subpel_h_shufA] + add srcq, ss3q + vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 + pshufb m2, m6 + pshufb m1, m6 + mova m0, m9 + vpdpbusd m0, m2, m7 + mova m4, m9 + vpdpbusd m4, m1, m7 + mova ym1, [spel_hv_perm4a] + mova ym2, [spel_hv_perm4b] + mova ym3, [spel_hv_perm4c] packssdw m0, m4 psraw m0, 2 ; _ 0 1 2 3 4 5 6 mov r6d, 0x5555 @@ -2100,11 +2634,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 - mova ym5, ym9 - vpdpwssd ym5, ym1, ym10 ; a0 b0 + pmaddwd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 - mova ym0, ym8 + mova 
ym0, ym9 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 @@ -2129,10 +2662,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq m0, [base+subpel_filters+myq*8] - vpbroadcastd m9, [pd_32768] - punpcklbw m0, m8, m0 + vpbroadcastq m1, [base+subpel_filters+myq*8] + punpcklbw m0, m1 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 @@ -2150,177 +2681,717 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti128 ymm2, [srcq+ssq*2], 1 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 add srcq, ss3q - vbroadcasti32x4 m4, [subpel_h_shufA] - vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ - vbroadcasti32x4 m7, [subpel_h_shufB] - vbroadcasti32x4 m17, [subpel_h_shufC] - pshufb m1, m6, m4 ; 0 1 2 3 0123 + vbroadcasti32x4 m4, [subpel_h_shufA] + vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ + vbroadcasti32x4 m7, [subpel_h_shufB] + vbroadcasti32x4 m8, [subpel_h_shufC] + pshufb m1, m6, m4 ; 0 1 2 3 0123 + mova m2, m9 + vpdpbusd m2, m1, m10 + pshufb m5, m6, m7 ; 0 1 2 3 4567 + mova m1, m9 + vpdpbusd m1, m5, m10 + pshufb m4, m0, m4 ; 4 5 6 _ 0123 + mova m3, m9 + vpdpbusd m3, m4, m10 + pshufb m7, m0, m7 ; 4 5 6 _ 4567 + mova m4, m9 + vpdpbusd m4, m7, m10 + pshufb m6, m8 + vpdpbusd m2, m5, m11 + vpdpbusd m1, m6, m11 + pshufb m6, m0, m8 + vpdpbusd m3, m7, m11 + vpdpbusd m4, m6, m11 + mova m5, [spel_hv_perm8a] + vpaddd m0, m5, [pb_32] {1to16} + mov r6, 0x55555555ff00 + packssdw m2, m1 + packssdw m3, m4 + mova m8, [spel_hv_perm8b] + psraw m2, 2 ; 0 1 2 3 + psraw m3, 2 ; 4 5 6 _ + vpermb m1, m5, m2 ; 01 12 + vbroadcasti32x8 m6, [subpel_h_shufA] + kmovq k1, r6 + vpermt2b m2, m0, m3 ; 23 34 + vbroadcasti32x8 m7, [subpel_h_shufB] + kshiftrq k2, k1, 16 + mova xm16, [spel_hv_end] + vpermb m3, m5, m3 ; 45 56 +.hv_w8_loop: + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m4{k1}, [srcq+ssq*0] + pmaddwd m0, m1, m12 ; a0 
b0 + pshufb m1, m4, m6 ; 7 8 0123 4567 + mova m5, m9 + vpdpbusd m5, m1, m10 + pshufb m4, m7 ; 7 8 4567 89ab + vpdpwssd m0, m2, m13 ; a1 b1 + mova m1, m2 + vpdpbusd m5, m4, m11 + mova m2, m3 + vpdpwssd m0, m3, m14 ; a2 b2 + psraw m3{k2}, m5, 2 ; 75 86 + vpermb m3, m8, m3 ; 67 78 + vpdpwssd m0, m3, m15 ; a3 b3 + packuswb m0, m0 + vpermb zmm1, m16, m0 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + WIN64_SPILL_XMM 23 + movu m22, [spel_hv_perm16a] + sub srcq, ss3q + vpbroadcastd m8, [pb_4] + lea r6d, [wq*2-32] + mova m7, [spel_hv_perm16b] + paddb m20, m8, m22 + mova ym16, [spel_hv_end16] + paddb m21, m8, m20 + lea r6d, [hq+r6*8] + paddb m8, m7 +.hv_w16_loop0: + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 + lea r4, [srcq+ss3q] + movu ym18, [srcq+ssq*2] + vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 + mov r7, dstq + movu ym19, [r4 +ssq*1] + vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 + add r4, ss3q + vpermb m2, m22, m17 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m17 ; 0 1 89ab ghij + vpdpbusd m1, m2, m10 + mova m2, m9 + vpermb m4, m22, m18 ; 2 3 0123 89ab + vpdpbusd m2, m3, m11 + mova m3, m9 + vpermb m5, m21, m18 ; 2 3 89ab ghij + vpdpbusd m3, m4, m10 + mova m4, m9 + vpermb m6, m22, m19 ; 4 5 0123 89ab + vpdpbusd m4, m5, m11 + mova m5, m9 + vpermb m17, m20, m17 ; 0 1 4567 cdef + vpdpbusd m5, m6, m10 + mova m6, m9 + vpermb m0, m21, m19 ; 4 5 89ab ghij + vpdpbusd m1, m17, m11 + vpdpbusd m2, m17, m10 + movu ym17, [r4+ssq*0] ; 6 + vpermb m18, m20, m18 ; 2 3 4567 cdef + vpdpbusd m6, m0, m11 + vpermb m0, m7, m17 ; 6 0145 2367 89cd abef + vpdpbusd m3, m18, m11 + vpermb m19, m20, m19 ; 4 5 4567 cdef + vpdpbusd m4, m18, m10 + mova m18, m9 + vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij + vpdpbusd m18, m0, m10 + packssdw m1, m2 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + packssdw m3, m4 + vpdpbusd m18, m17, m11 + psraw m1, 2 ; 01 + psraw m3, 2 ; 23 + packssdw m5, m6 + 
vpshrdd m2, m1, m3, 16 ; 12 + psraw m5, 2 ; 45 + vpshrdd m4, m3, m5, 16 ; 34 + psraw m18, 2 + vpshrdd m6, m5, m18, 16 ; 56 +.hv_w16_loop: + movu ym19, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m19, [r4+ssq*0], 1 + pmaddwd m17, m1, m12 ; a0 + vpermb m1, m22, m19 ; 7 8 0123 89ab + pmaddwd m18, m2, m12 ; b0 + mova m0, m9 + vpermb m2, m21, m19 ; 7 8 89ab ghij + vpdpbusd m0, m1, m10 + mova m1, m9 + vpermb m19, m20, m19 ; 7 8 4567 cdef + vpdpbusd m1, m2, m11 + mova m2, m4 + vpdpwssd m17, m3, m13 ; a1 + vpdpwssd m18, m4, m13 ; b1 + mova m4, m6 + vpdpbusd m0, m19, m11 + vpdpbusd m1, m19, m10 + vpdpwssd m17, m5, m14 ; a2 + vpdpwssd m18, m6, m14 ; b2 + packssdw m0, m1 + mova m1, m3 + psraw m6, m0, 2 ; 78 + mova m3, m5 + vpshrdd m5, m4, m6, 16 ; 67 + vpdpwssd m18, m6, m15 ; b3 + vpdpwssd m17, m5, m15 ; a3 + packuswb m17, m18 + vpermb m17, m16, m17 + mova [r7+dsq*0], xm17 + vextracti128 [r7+dsq*1], ym17, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3 +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.v: + movzx mxd, myb + shr myd, 16 + cmp 
hd, 4 + cmove myd, mxd + tzcnt r5d, wd + lea myq, [base+subpel_filters+1+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] + vpbroadcastd m7, [pw_8192] + sub srcq, ssq + vpbroadcastw m8, [myq+0] + add r5, r7 + vpbroadcastw m9, [myq+2] + lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, ssq + jmp r5 +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + vpbroadcastd ymm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd ymm3, [srcq+ssq*0] + vpbroadcastd ymm0, [srcq+ssq*1] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm1, ymm2, 0xeb + punpcklqdq ymm3, ymm0 + vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb ymm1, ymm5 ; 01 12 23 34 +.v_w4_loop: + pinsrd xmm0, [srcq+ssq*2], 1 + vpbroadcastd ymm2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastd ymm3, [srcq+ssq*0] + vpblendd ymm2, ymm0, 0xeb + vpbroadcastd ymm0, [srcq+ssq*1] + punpcklqdq ymm3, ymm0 + vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb ymm2, ymm5 ; 45 56 67 78 + pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0 + vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56 + pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2 + pmaddubsw ymm1, ym9 ; a1 b1 c1 d1 + paddw ymm3, ymm4 + paddw ymm3, ymm1 + pmulhrsw ymm3, ym7 + mova ymm1, ymm2 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mova m6, [spel_v_perm8] + movq xm1, [srcq+ssq*0] + mov r6d, 0x3e + movq xm2, [srcq+ssq*1] + kmovb k1, r6d + vpbroadcastq ym3, [srcq+ssq*2] + add srcq, ss3q + vpunpcklqdq ym2, [srcq+ssq*0] {1to4} + vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} + movq xm0, [srcq+ssq*1] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x18 ; 0 1 2 3 4 + vpermb m1, m6, m1 ; 01 12 23 34 +.v_w8_loop: + vpbroadcastq ym3, [srcq+ss3q ] + vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*1] + vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + vpermb m2, m6, m0 ; 45 56 67 78 + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 23 34 45 56 + 
pmaddubsw m3, m2, m10 ; a3 b3 c3 d3 + pmaddubsw m5, m1, m9 ; a2 b2 c2 d2 + mova m1, m2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m11, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+ssq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym3, [srcq+ssq*1] + vbroadcasti32x4 m2, [srcq+ssq*2] + kmovb k1, r6d + add srcq, ss3q + vbroadcasti32x4 ym4, [srcq+ssq*0] + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m11, m1 ; 01 12 + vpermb m2, m11, m2 ; 23 34 +.v_w16_loop: + pmaddubsw m3, m1, m8 ; a0 b0 + pmaddubsw m5, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+ssq*2] + pmaddubsw m4, m2, m8 ; c0 d0 + vbroadcasti32x4 m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vshufpd m0{k1}, m6, m2, 0xcc + vbroadcasti32x4 ym6, [srcq+ssq*0] + vpermb m1, m11, m0 ; 45 56 + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m2{k1}, m6, m0, 0xcc + pmaddubsw m6, m1, m9 ; c1 d1 + vpermb m2, m11, m2 ; 67 78 + paddw m3, m5 + pmaddubsw m5, m1, m10 ; a2 b2 + paddw m4, m6 + pmaddubsw m6, m2, m10 ; c2 d2 + paddw m3, m5 + paddw m4, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m7 + mova [tmpq+ 0], m3 + mova [tmpq+64], m4 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movshdup m6, [bilin_v_perm64] + movu ym16, [srcq+ssq*0] + movu ym17, [srcq+ssq*1] + movu ym18, [srcq+ssq*2] + add srcq, ss3q + movu ym19, [srcq+ssq*0] + add srcq, ssq + movu ym20, [srcq+ssq*0] + vpermt2q m16, m6, m18 ; 0 2 + vpermt2q m17, m6, m19 ; 1 3 + vpermt2q m18, m6, m20 ; 2 4 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpckhbw m2, m16, m17 ; 23 + punpckhbw m3, m17, m18 ; 34 +.v_w32_loop: + movu ym16, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + pmaddubsw m4, m0, m8 ; a0 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + vpermt2q m16, m6, m17 ; 5 6 + pmaddubsw m5, m1, m8 ; b0 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + shufpd m18, m16, 0x55 ; 4 5 + paddw m4, m2 + punpcklbw m2, m18, 
m16 ; 45 + paddw m5, m3 + punpckhbw m3, m18, m16 ; 56 + mova m18, m16 + pmaddubsw m16, m2, m10 ; a2 + pmaddubsw m17, m3, m10 ; b2 + paddw m4, m16 + paddw m5, m17 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + mova m6, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] +.v_loop0: + vpermq m12, m6, [srcq+ssq*0] + vpermq m13, m6, [srcq+ssq*1] + lea r5, [srcq+ssq*2] + vpermq m14, m6, [r5 +ssq*0] + vpermq m15, m6, [r5 +ssq*1] + lea r5, [r5+ssq*2] + vpermq m16, m6, [r5 +ssq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 +.v_loop: + pmaddubsw m17, m0, m8 ; a0 + vpermq m5, m6, [r5+ssq*1] + pmaddubsw m18, m12, m8 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+ssq*2] + pmaddubsw m19, m1, m8 ; b0 + pmaddubsw m20, m13, m8 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + mova m13, m15 + pmaddubsw m15, m9 + paddw m17, m2 + punpcklbw m2, m16, m5 ; 67 + paddw m18, m14 + punpckhbw m14, m16, m5 + vpermq m16, m6, [r5+ssq*0] + paddw m19, m3 + pmaddubsw m3, m2, m10 ; a3 + paddw m20, m15 + pmaddubsw m15, m14, m10 + paddw m17, m3 + punpcklbw m3, m5, m16 ; 78 + pmaddubsw m4, m3, m10 ; b3 + paddw m18, m15 + punpckhbw m15, m5, m16 + pmaddubsw m5, m15, m10 + paddw m19, m4 + paddw m20, m5 + REPX {pmulhrsw x, m7}, m17, m18, m19, m20 + mova [r7+wq*0+ 0], m17 + mova [r7+wq*0+64], m18 + mova [r7+wq*1+ 0], m19 + mova [r7+wq*1+64], m20 + lea r7, [r7+wq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add tmpq, 128 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_loop0 + vzeroupper + RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 +.hv: + vpbroadcastd m8, [pd_2] + vpbroadcastd m9, [pd_32] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m11, 
[base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m3, [base+subpel_filters+1+myq*8] + vbroadcasti128 m10, [subpel_h_shufA] + lea r6, [ssq*2+1] + mov r3d, 0x30 + sub srcq, r6 + kmovb k1, r3d + vpbroadcastq ym2, [srcq+ssq*0] + lea ss3q, [ssq*3] + vpbroadcastq m1, [srcq+ssq*1] + kaddb k2, k1, k1 + vpbroadcastq m2{k1}, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3 + punpcklbw m3, m3 + vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4 + psraw m3, 8 ; sign-extend + mova m6, [spel_hv_perm4a] + kshiftrb k1, k1, 2 + movu m7, [spel_hv_perm4b] + pshufb m1, m10 + mova m0, m8 + vpdpbusd m0, m1, m11 + pshufb m2, m10 + mova m1, m8 + vpdpbusd m1, m2, m11 + pshufd m12, m3, q0000 + pshufd m13, m3, q1111 + pshufd m14, m3, q2222 + packssdw m0, m1 ; _ _ _ 0 1 2 3 4 + psraw m0, 2 + vpermb m1, m7, m0 ; 01 12 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*2] + movq xm4, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7 + vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8 + pshufb ym3, ym10 + mova ym2, ym8 + vpdpbusd ym2, ym3, ym11 + pshufb ym4, ym10 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + mova m4, m9 + vpdpwssd m4, m1, m12 ; a0 b0 c0 d0 + packssdw ym2, ym3 ; 5 6 7 8 + psraw ym2, 2 + vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8 + vpermb m2, m6, m0 ; 23 34 45 56 + vpermb m1, m7, m0 ; 45 56 67 78 + vpdpwssd m4, m2, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m1, m14 ; a2 b2 c2 d2 + psrad m4, 6 + vpmovdw [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + lea r6, [ssq*2+3] + punpcklbw m0, m0 + sub srcq, r6 + psraw m0, 8 ; sign-extend + lea ss3q, [ssq*3] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + cmp wd, 8 + jg .hv_w16 + movu xm16, 
[srcq+ssq*0] + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti128 ym16, [srcq+ssq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+ssq*2], 2 + add srcq, ss3q + vinserti32x4 m16, [srcq+ssq*0], 3 + movu xm17, [srcq+ssq*1] + vbroadcasti32x4 m20, [subpel_h_shufB] + pshufb m3, m16, m19 ; 0 1 2 3 0123 mova m2, m8 - vpdpbusd m2, m1, m10 - pshufb m5, m6, m7 ; 0 1 2 3 4567 - mova m1, m8 - vpdpbusd m1, m5, m10 - pshufb m4, m0, m4 ; 4 5 6 _ 0123 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 mova m3, m8 - vpdpbusd m3, m4, m10 - pshufb m7, m0, m7 ; 4 5 6 _ 4567 - mova m4, m8 - vpdpbusd m4, m7, m10 - pshufb m6, m17 - vpdpbusd m2, m5, m11 - vpdpbusd m1, m6, m11 - pshufb m6, m0, m17 - vpdpbusd m3, m7, m11 - vpdpbusd m4, m6, m11 - mova m5, [spel_hv_perm8a] - mova m0, [spel_hv_perm8b] - mov r6, 0x55555555ff00 - packssdw m2, m1 - packssdw m3, m4 - mova m18, [spel_hv_perm8c] - psraw m2, 2 ; 0 1 2 3 - psraw m3, 2 ; 4 5 6 _ - vpermb m1, m5, m2 ; 01 12 - vbroadcasti32x8 m6, [subpel_h_shufA] - kmovq k1, r6 - vpermt2b m2, m0, m3 ; 23 34 - vbroadcasti32x8 m7, [subpel_h_shufB] - kshiftrq k2, k1, 16 - mova xm16, [spel_hv_end] - vpermb m3, m5, m3 ; 45 56 + pshufb xm1, xm17, xm19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova xm0, xm8 + pshufb xm18, xm17, xm21 ; 3 4 5 6 89ab + vpdpbusd xm0, xm1, xm10 + mova xm1, xm8 + pshufb m16, m20 ; 0 1 2 3 4567 + vpdpbusd xm1, xm18, xm11 + pshufb xm17, xm20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd xm0, xm17, xm11 + vpdpbusd xm1, xm17, xm10 + packssdw m2, m3 + packssdw xm0, xm1 + psraw m2, 2 ; 0 1 2 3 + psraw xm0, 2 ; 4 + valignq m0, m2, 2 ; 1 2 3 4 + punpcklwd m1, m2, m0 ; 01 12 23 34 + punpckhwd m2, m0 .hv_w8_loop: - vbroadcasti32x4 ym4, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti32x4 m4{k1}, [srcq+ssq*0] - mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 b0 - pshufb m1, m4, m6 ; 7 8 0123 4567 + movu xm16, [srcq+ssq*2] + vinserti128 ym16, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + 
vinserti32x4 m16, [srcq+ssq*0], 2 + vinserti32x4 m16, [srcq+ssq*1], 3 + pshufb m6, m16, m19 ; 5 6 7 8 0123 mova m5, m8 - vpdpbusd m5, m1, m10 - pshufb m4, m7 ; 7 8 4567 89ab - vpdpwssd m0, m2, m13 ; a1 b1 - mova m1, m2 - vpdpbusd m5, m4, m11 - mova m2, m3 - vpdpwssd m0, m3, m14 ; a2 b2 - psraw m3{k2}, m5, 2 ; 75 86 - vpermb m3, m18, m3 ; 67 78 - vpdpwssd m0, m3, m15 ; a3 b3 - packuswb m0, m0 - vpermb zmm1, m16, m0 - movq [dstq+dsq*0], xmm1 - movhps [dstq+dsq*1], xmm1 - lea dstq, [dstq+dsq*2] - sub hd, 2 + pshufb m3, m16, m21 ; 5 6 7 8 89ab + vpdpbusd m5, m6, m10 + mova m6, m8 + pshufb m16, m20 ; 5 6 7 8 4567 + vpdpbusd m6, m3, m11 + mova m3, m9 + vpdpwssd m3, m1, m12 ; a0 b0 c0 d0 + mova m4, m9 + vpdpwssd m4, m2, m12 + vpdpbusd m5, m16, m11 + vpdpbusd m6, m16, m10 + mova m16, m1 + packssdw m5, m6 + mova m6, m2 + psraw m5, 2 ; 5 6 7 8 + valignq m2, m5, m0, 6 ; 4 5 6 7 + mova m0, m5 + punpcklwd m1, m2, m5 ; 45 56 67 78 + punpckhwd m2, m5 + vpdpwssd m3, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m4, m2, m14 + vshufi32x4 m16, m1, q1032 ; 23 34 45 56 + vshufi32x4 m6, m2, q1032 + vpdpwssd m3, m16, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m6, m13 + psrad m3, 6 + psrad m4, 6 + packssdw m3, m4 + mova [tmpq], m3 + add tmpq, 64 + sub hd, 4 jg .hv_w8_loop vzeroupper RET .hv_w16: - movu m7, [spel_hv_perm16a] - sub srcq, ss3q - mova m20, [spel_hv_perm16b] - lea r6d, [wq*2-32] - mova m21, [spel_hv_perm16c] - mov r4, srcq - mov r7, dstq - mova ym16, [spel_hv_end16] - lea r6d, [hq+r6*8] + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 + lea r6d, [hq+wq*8-256] + paddb m18, m17 .hv_w16_loop0: - movu ym17, [srcq+ssq*0] - vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 - movu ym18, [srcq+ssq*2] - add srcq, ss3q - vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 - movu ym19, [srcq+ssq*1] - vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 - add srcq, ss3q - vpermb m2, m7, m17 ; 0 1 0123 89ab - vpermb m0, m20, m17 ; 0 1 4567 cdef - vpermb m4, m7, m18 ; 2 3 0123 89ab - mova m1, m8 - vpdpbusd 
m1, m2, m10 - vpermb m5, m20, m18 ; 2 3 4567 cdef + movu ym19, [srcq+ssq*0] + vinserti32x8 m19, [srcq+ssq*1], 1 + lea r5, [srcq+ssq*2] + movu ym20, [r5 +ssq*0] + vinserti32x8 m20, [r5 +ssq*1], 1 + lea r5, [r5 +ssq*2] + movu ym21, [r5 +ssq*0] + mov r7, tmpq + vpermb m3, m16, m19 ; 0 1 0123 89ab mova m2, m8 - vpdpbusd m2, m0, m10 - vpermb m17, m21, m17 ; 0 1 89ab ghij + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 mova m3, m8 - vpdpbusd m3, m4, m10 - vpermb m6, m7, m19 ; 4 5 0123 89ab + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 mova m4, m8 + vpermb m0, m18, m20 ; 2 3 89ab ghij vpdpbusd m4, m5, m10 - vpermb m18, m21, m18 ; 2 3 89ab ghij - vpdpbusd m1, m0, m11 - movu ym0, [srcq+ssq*0] ; 6 - vpdpbusd m2, m17, m11 - vpermb m17, m20, m19 ; 4 5 4567 cdef - vpdpbusd m3, m5, m11 mova m5, m8 + vpermb ym1, ym16, ym21 ; 4 0123 89ab + vpdpbusd m5, m0, m11 + mova ym0, ym8 + vpermb ym6, ym18, ym21 ; 4 89ab ghij + vpdpbusd ym0, ym1, ym10 + mova ym1, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym1, ym6, ym11 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + vpermb ym21, ym17, ym21 ; 4 4567 cdef + vpdpbusd m4, m20, m11 + vpdpbusd m5, m20, m10 + vpdpbusd ym0, ym21, ym11 + vpdpbusd ym1, ym21, ym10 + packssdw m2, m3 ; 0 1 + packssdw m4, m5 ; 2 3 + packssdw ym0, ym1 ; 4 + REPX {psraw x, 2}, m2, m4, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m0, m4, m0, q1032 ; 3 4 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 ; 23 34 + punpckhwd m4, m0 +.hv_w16_loop: + movu ym19, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti32x8 m19, [r5+ssq*0], 1 + vpermb m6, m16, m19 ; 5 6 0123 89ab + mova m5, m8 + vpermb m20, m18, m19 ; 5 6 89ab ghij vpdpbusd m5, m6, m10 mova m6, m8 - vpdpbusd m6, m17, m10 - vpdpbusd m4, m18, m11 - mova m18, [spel_hv_perm16d] - vpermb m18, m18, m0 ; 6 0145 2367 89cd abef - vpdpbusd m5, m17, m11 - vpermb m19, m21, m19 ; 4 5 89ab ghij - mova m17, m8 - vpdpbusd m17, m18, m10 
- mova m18, [spel_hv_perm16e] - vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij - packssdw m1, m2 ; 01 - vpdpbusd m6, m19, m11 - packssdw m3, m4 ; 23 - vpdpbusd m17, m0, m11 - psraw m1, 2 - packssdw m5, m6 ; 45 - psraw m3, 2 - vpshrdd m2, m1, m3, 16 ; 12 - psraw m5, 2 - vpshrdd m4, m3, m5, 16 ; 34 - psraw m17, 2 - vpshrdd m6, m5, m17, 16 ; 56 -.hv_w16_loop: - movu ym18, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti32x8 m18, [srcq+ssq*0], 1 - mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 - vpermb m1, m7, m18 ; 7 8 0123 89ab - mova m17, m9 - vpdpwssd m17, m2, m12 ; b0 - vpermb m2, m20, m18 ; 7 8 4567 cdef - mova m19, m8 - vpdpbusd m19, m1, m10 - vpermb m18, m21, m18 - mova m1, m8 - vpdpbusd m1, m2, m10 - vpdpwssd m0, m3, m13 ; a1 - vpdpwssd m17, m4, m13 ; b1 - vpdpbusd m19, m2, m11 - mova m2, m4 - vpdpbusd m1, m18, m11 - mova m4, m6 - vpdpwssd m0, m5, m14 ; a2 - vpdpwssd m17, m6, m14 ; b2 - packssdw m19, m1 + vpermb m19, m17, m19 ; 5 6 4567 cdef + vpdpbusd m6, m20, m11 + mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 b0 + mova m21, m9 + vpdpwssd m21, m2, m12 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + vpdpwssd m20, m3, m13 ; a1 b1 + vpdpwssd m21, m4, m13 + packssdw m5, m6 mova m1, m3 - mova m3, m5 - psraw m6, m19, 2 ; 7 8 - vpshrdd m5, m4, m6, 16 ; 6 7 - vpdpwssd m17, m6, m15 ; b3 - vpdpwssd m0, m5, m15 ; a3 - packuswb m0, m17 - vpermb zmm1, m16, m0 - mova [dstq+dsq*0], xmm1 - vextracti128 [dstq+dsq*1], ymm1, 1 - lea dstq, [dstq+dsq*2] + psraw m5, 2 ; 5 6 + mova m2, m4 + vshufi32x4 m4, m0, m5, q1032 ; 4 5 + mova m0, m5 + punpcklwd m3, m4, m0 ; 45 56 + punpckhwd m4, m0 + vpdpwssd m20, m3, m14 ; a2 b2 + vpdpwssd m21, m4, m14 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] sub hd, 2 jg .hv_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 vzeroupper @@ -2353,57 +3424,333 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, 
ds, src, ss, w, h, mx, my, ss3 mova [tmpq+64*1], m1 %endmacro -%if WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif - -%define PREP_8TAP_FN FN prep_8tap, - +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 +cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] - movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - movzx wd, word [r7+wq*2+table_offset(prep,)] - add wq, r7 - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep +.v: + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. 
+ tzcnt r5d, wd + lea myq, [base+subpel_filters+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)] + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + add r5, r7 + vpbroadcastw m9, [myq+2] + lea stride3q, [strideq*3] + vpbroadcastw m10, [myq+4] + sub srcq, stride3q + vpbroadcastw m11, [myq+6] + jmp r5 +.v_w4: + movd xmm0, [srcq+strideq*0] + vpbroadcastd ymm1, [srcq+strideq*2] + vpbroadcastd xmm2, [srcq+strideq*1] + vpbroadcastd ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd ymm0, [srcq+strideq*0] + vpbroadcastd ymm2, [srcq+strideq*1] + vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd ymm0, [srcq+strideq*2] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 + vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw ymm2, ymm3 ; 23 34 45 56 +.v_w4_loop: + pinsrd xmm0, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpbroadcastd ymm3, [srcq+strideq*0] + vpbroadcastd ymm4, [srcq+strideq*1] + vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ + vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ + vpbroadcastd ymm0, [srcq+strideq*2] + vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb ymm3, ymm5 ; 67 78 89 9a + pmaddubsw ymm4, ymm1, ym8 + vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 + pmaddubsw ymm2, ym9 + paddw ymm4, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym11 + paddw ymm3, ymm4 + pmaddubsw ymm4, ymm1, ym10 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym7 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mova m6, [spel_v_perm8] + movq xm1, [srcq+strideq*0] + mov r6d, 0x3e + movq xm2, [srcq+strideq*1] + vpbroadcastq ym3, [srcq+strideq*2] + kmovb k1, r6d + vpbroadcastq ym4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpunpcklqdq 
m1{k1}, m3, [srcq+strideq*0] {1to8} + vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8} + movq xm0, [srcq+strideq*2] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x30 ; 0 1 2 3 4 5 + vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _ + vpermb m1, m6, m1 ; 01 12 23 34 + vpermb m2, m6, m2 ; 23 34 45 56 +.v_w8_loop: + vpbroadcastq ym3, [srcq+strideq*4] + vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4} + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*2] + vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 c1 d1 + vpermb m2, m6, m0 ; 67 78 89 9a + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 45 56 67 78 + pmaddubsw m3, m2, m11 ; a3 b3 c3 d3 + paddw m4, m5 + pmaddubsw m5, m1, m10 ; a2 b2 c2 d2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m12, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+strideq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym4, [srcq+strideq*1] + vbroadcasti32x4 m2, [srcq+strideq*2] + kmovb k1, r6d + vbroadcasti32x4 ym5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti32x4 m3, [srcq+strideq*0] + vbroadcasti32x4 ym6, [srcq+strideq*1] + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m3 + pmaddubsw m13, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+stride3q ] + pmaddubsw m5, m2, m8 ; c0 d0 + lea srcq, [srcq+strideq*4] + pmaddubsw m14, m3, m9 ; c1 d1 + vbroadcasti32x4 m3, [srcq+strideq*0] + vshufpd m0{k1}, m6, m3, 0xcc + vbroadcasti32x4 ym6, [srcq+strideq*1] + vpermb m2, m12, m0 ; 67 78 + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m3{k1}, m6, m0, 0xcc + paddw m4, m13 + pmaddubsw m13, m1, m10 ; a2 b2 + vpermb m3, m12, m3 ; 89 9a + paddw m5, m14 + pmaddubsw m14, m2, m10 ; c2 d2 
+ pmaddubsw m15, m2, m11 ; a3 b3 + pmaddubsw m6, m3, m11 ; c3 d3 + paddw m4, m13 + paddw m5, m14 + paddw m4, m15 + paddw m5, m6 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movshdup m21, [bilin_v_perm64] + movu ym16, [srcq+strideq*0] + movu ym17, [srcq+strideq*1] + movu ym18, [srcq+strideq*2] + add srcq, stride3q + movu ym19, [srcq+strideq*0] + vpermt2q m16, m21, m19 ; 0 3 + movu ym20, [srcq+strideq*1] + vpermt2q m17, m21, m20 ; 1 4 + movu ym20, [srcq+strideq*2] + add srcq, stride3q + vpermt2q m18, m21, m20 ; 2 5 + movu ym20, [srcq+strideq*0] + vpermt2q m19, m21, m20 ; 3 6 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpcklbw m2, m18, m19 ; 23 + punpckhbw m3, m16, m17 ; 34 + punpckhbw m4, m17, m18 ; 45 + punpckhbw m5, m18, m19 ; 56 +.v_w32_loop: + movu ym16, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym17, [srcq+strideq*0] + pmaddubsw m14, m0, m8 + mova m0, m2 + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m2, m9 + vpermt2q m16, m21, m17 ; 7 8 + pmaddubsw m3, m9 + pmaddubsw m12, m4, m10 + pmaddubsw m13, m5, m10 + shufpd m19, m16, 0x55 ; 6 7 + paddw m14, m2 + mova m2, m4 + punpcklbw m4, m19, m16 ; 67 + paddw m15, m3 + mova m3, m5 + punpckhbw m5, m19, m16 ; 78 + paddw m14, m12 + paddw m15, m13 + pmaddubsw m12, m4, m11 + pmaddubsw m13, m5, m11 + mova m19, m16 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova [tmpq+ 0], m14 + mova [tmpq+64], m15 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + WIN64_SPILL_XMM 24 + mova m23, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] +.v_loop0: + vpermq m12, m23, [srcq+strideq*0] + vpermq m13, m23, [srcq+strideq*1] + lea r5, [srcq+strideq*2] + vpermq m14, m23, [r5 +strideq*0] + vpermq m15, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m16, m23, [r5 +strideq*0] + vpermq m17, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m18, 
m23, [r5 +strideq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 + punpcklbw m4, m16, m17 ; 45 + punpckhbw m16, m17 + punpcklbw m5, m17, m18 ; 56 + punpckhbw m17, m18 +.v_loop: + pmaddubsw m19, m0, m8 ; a0 + vpermq m6, m23, [r5+strideq*1] + pmaddubsw m20, m12, m8 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+strideq*2] + pmaddubsw m21, m1, m8 ; b0 + pmaddubsw m22, m13, m8 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + mova m13, m15 + pmaddubsw m15, m9 + paddw m19, m2 + mova m2, m4 + pmaddubsw m4, m10 ; a2 + paddw m20, m14 + mova m14, m16 + pmaddubsw m16, m10 + paddw m21, m3 + mova m3, m5 + pmaddubsw m5, m10 ; b2 + paddw m22, m15 + mova m15, m17 + pmaddubsw m17, m10 + paddw m19, m4 + punpcklbw m4, m18, m6 ; 67 + paddw m20, m16 + punpckhbw m16, m18, m6 + vpermq m18, m23, [r5+strideq*0] + paddw m21, m5 + pmaddubsw m5, m4, m11 ; a3 + paddw m22, m17 + pmaddubsw m17, m16, m11 + paddw m19, m5 + punpcklbw m5, m6, m18 ; 78 + paddw m20, m17 + punpckhbw m17, m6, m18 + pmaddubsw m6, m5, m11 ; b3 + paddw m21, m6 + pmaddubsw m6, m17, m11 + paddw m22, m6 + REPX {pmulhrsw x, m7}, m19, m20, m21, m22 + mova [r7+wq*0+ 0], m19 + mova [r7+wq*0+64], m20 + mova [r7+wq*1+ 0], m21 + mova [r7+wq*1+64], m22 + lea r7, [r7+wq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add tmpq, 128 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_loop0 + RET .h: + RESET_STACK_STATE test myd, 0xf00 jnz .hv +.h2: vpbroadcastd m4, [pd_2] - WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + vpbroadcastd m8, [base+subpel_filters+mxq*8+0] + vpbroadcastd m9, [base+subpel_filters+mxq*8+4] add wq, r7 jmp wq .h_w4: @@ 
-2411,7 +3758,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vbroadcasti128 ym5, [subpel_h_shufA] mov r3d, 0x4 dec srcq - vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + vpbroadcastd ym6, [base+subpel_filters+mxq*8+2] kmovb k1, r3d lea stride3q, [strideq*3] .h_w4_loop: @@ -2461,10 +3808,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w8_loop RET .h_w16: - mova m5, [spel_h_perm16a] - mova m6, [spel_h_perm16b] - mova m7, [spel_h_perm16c] + mova m5, [spel_h_perm16] + vpbroadcastd m7, [pb_4] lea stride3q, [strideq*3] + paddb m6, m7, m5 + paddb m7, m6 .h_w16_loop: movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*2] @@ -2477,9 +3825,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w16_loop RET .h_w32: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] + paddb m6, m7, m5 + paddb m7, m6 .h_w32_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] @@ -2495,409 +3844,47 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 .h_w128: mov r6, -64*1 .h_start: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] sub srcq, r6 + paddb m6, m7, m5 + paddb m7, m6 +.h_loop0: mov r5, r6 .h_loop: - movu m0, [srcq+r6+32*0] - movu m1, [srcq+r6+32*1] + movu m0, [srcq+r5+32*0] + movu m1, [srcq+r5+32*1] PREP_8TAP_H add tmpq, 64*2 - add r6, 64 + add r5, 64 jle .h_loop add srcq, strideq - mov r6, r5 dec hd - jg .h_loop - RET -.v: - movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. - shr myd, 16 ; Note that the code is 8-tap only, having - tzcnt wd, wd - cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 - cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? 
- lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] - add wq, r7 - lea stride3q, [strideq*3] - sub srcq, stride3q - vpbroadcastd m7, [pw_8192] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] - jmp wq -.v_w4: - movd xmm0, [srcq+strideq*0] - vpbroadcastd ymm1, [srcq+strideq*2] - vpbroadcastd xmm2, [srcq+strideq*1] - vpbroadcastd ymm3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ - vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ - vpbroadcastd ymm0, [srcq+strideq*0] - vpbroadcastd ymm2, [srcq+strideq*1] - vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ - vpbroadcastd ymm0, [srcq+strideq*2] - vbroadcasti128 ymm5, [deint_shuf4] - vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 - vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 - vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ - punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 - vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 - punpckhbw ymm2, ymm3 ; 23 34 45 56 -.v_w4_loop: - pinsrd xmm0, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - vpbroadcastd ymm3, [srcq+strideq*0] - vpbroadcastd ymm4, [srcq+strideq*1] - vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ - vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ - vpbroadcastd ymm0, [srcq+strideq*2] - vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ - pshufb ymm3, ymm5 ; 67 78 89 9a - pmaddubsw ymm4, ymm1, ym8 - vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 - pmaddubsw ymm2, ym9 - paddw ymm4, ymm2 - mova ymm2, ymm3 - pmaddubsw ymm3, ym11 - paddw ymm3, ymm4 - pmaddubsw ymm4, ymm1, ym10 - paddw ymm3, ymm4 - pmulhrsw ymm3, ym7 - mova [tmpq], ymm3 - add tmpq, 32 - sub hd, 4 - jg .v_w4_loop - vzeroupper - RET -.v_w8: - mov r3d, 0xf044 - kmovw k1, r3d - kshiftrw k2, k1, 8 - movq xm0, [srcq+strideq*0] - vpbroadcastq ym1, [srcq+strideq*1] - vpbroadcastq m2, [srcq+strideq*2] - vpbroadcastq m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m4, 
[srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vmovdqa64 ym0{k1}, ym1 - vmovdqa64 ym1{k1}, ym2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - punpcklbw ym0, ym1 ; 01 12 __ __ - punpcklbw m2, m3 ; 23 34 23 34 - punpcklbw m4, m5 ; 45 56 45 56 - vmovdqa64 m0{k2}, m2 ; 01 12 23 34 - vmovdqa64 m2{k2}, m4 ; 23 34 45 56 -.v_w8_loop: - vpbroadcastq m1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - pmaddubsw m14, m0, m8 - pmaddubsw m15, m2, m9 - vpblendmq m0{k1}, m6, m1 - vpblendmq m2{k1}, m1, m3 - vpbroadcastq m6, [srcq+strideq*2] - paddw m14, m15 - punpcklbw m2, m0, m2 ; 67 78 67 78 - vpblendmq m12{k1}, m3, m5 - vpblendmq m13{k1}, m5, m6 - vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 - punpcklbw m4, m12, m13 ; 89 9a 89 9a - vmovdqa64 m2{k2}, m4 ; 67 78 89 9a - pmaddubsw m12, m0, m10 - pmaddubsw m13, m2, m11 - paddw m14, m12 - paddw m14, m13 - pmulhrsw m14, m7 - mova [tmpq], m14 - add tmpq, 64 - sub hd, 4 - jg .v_w8_loop - RET -.v_w16: - mov r3d, 0xf0 - kmovb k1, r3d - vbroadcasti128 m0, [srcq+strideq*0] - vbroadcasti128 m1, [srcq+strideq*1] - vbroadcasti128 m2, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m4, [srcq+strideq*0] - vbroadcasti128 m5, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - vmovdqa64 m0{k1}, m1 - vmovdqa64 m1{k1}, m2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b - shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b - shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- - shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- - punpckhbw m2, m0, m1 ; 23a 23b 34a 34b - punpcklbw m0, m1 ; 01a 01b 12a 12b - punpcklbw m4, m5 ; 45a 45b 56a 56b -.v_w16_loop: - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m5, [srcq+strideq*0] - 
vpblendmq m1{k1}, m6, m3 - vmovdqa64 m3{k1}, m5 - pmaddubsw m12, m0, m8 - pmaddubsw m13, m2, m8 - pmaddubsw m14, m2, m9 - pmaddubsw m15, m4, m9 - pmaddubsw m0, m4, m10 - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - paddw m12, m14 - paddw m13, m15 - paddw m12, m0 - vmovdqa64 m5{k1}, m2 - vmovdqa64 m2{k1}, m6 - mova m0, m4 - shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b - shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab - punpcklbw m2, m1, m3 ; 67a 67b 78a 78b - punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab - pmaddubsw m14, m2, m10 - pmaddubsw m15, m2, m11 - paddw m13, m14 - paddw m12, m15 - pmaddubsw m14, m4, m11 - paddw m13, m14 - pmulhrsw m12, m7 - pmulhrsw m13, m7 - mova [tmpq+ 0], m12 - mova [tmpq+64], m13 - add tmpq, 64*2 - sub hd, 4 - jg .v_w16_loop - RET -.v_w32: - mova m18, [bilin_v_perm64] - movu ym0, [srcq+strideq*0] - movu ym1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - movu ym3, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym4, [srcq+strideq*0] - movu ym5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym6, [srcq+strideq*0] - vpermq m0, m18, m0 - vpermq m1, m18, m1 - vpermq m2, m18, m2 - vpermq m3, m18, m3 - vpermq m4, m18, m4 - vpermq m5, m18, m5 - vpermq m6, m18, m6 - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - punpcklbw m3, m4 - punpcklbw m4, m5 - punpcklbw m5, m6 -.v_w32_loop: - movu ym12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym13, [srcq+strideq*0] - pmaddubsw m14, m0, m8 - pmaddubsw m16, m2, m9 - pmaddubsw m15, m1, m8 - pmaddubsw m17, m3, m9 - mova m0, m2 - mova m1, m3 - vpermq m12, m18, m12 - vpermq m13, m18, m13 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m4, m10 - pmaddubsw m17, m5, m10 - punpcklbw m6, m12 - punpcklbw m12, m13 - mova m2, m4 - mova m3, m5 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m6, m11 - pmaddubsw m17, m12, m11 - mova m4, m6 - mova m5, m12 - paddw m14, m16 - paddw m15, m17 - pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova 
m6, m13 - mova [tmpq+ 0], m14 - mova [tmpq+64], m15 - add tmpq, 64*2 - sub hd, 2 - jg .v_w32_loop - vzeroupper - RET -.v_w64: - mov wd, 64 - jmp .v_start -.v_w128: - mov wd, 128 -.v_start: - WIN64_SPILL_XMM 27 - mova m26, [bilin_v_perm64] - lea r6d, [hq+wq*2] - mov r5, srcq - mov r7, tmpq -.v_loop0: - vpermq m0, m26, [srcq+strideq*0] - vpermq m1, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m2, m26, [srcq+strideq*0] - vpermq m3, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m4, m26, [srcq+strideq*0] - vpermq m5, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m6, m26, [srcq+strideq*0] - punpckhbw m12, m0, m1 - punpcklbw m0, m1 - punpckhbw m13, m1, m2 - punpcklbw m1, m2 - punpckhbw m14, m2, m3 - punpcklbw m2, m3 - punpckhbw m15, m3, m4 - punpcklbw m3, m4 - punpckhbw m16, m4, m5 - punpcklbw m4, m5 - punpckhbw m17, m5, m6 - punpcklbw m5, m6 -.v_loop: - vpermq m18, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m19, m26, [srcq+strideq*0] - pmaddubsw m20, m0, m8 - pmaddubsw m21, m12, m8 - pmaddubsw m22, m1, m8 - pmaddubsw m23, m13, m8 - mova m0, m2 - mova m12, m14 - mova m1, m3 - mova m13, m15 - pmaddubsw m2, m9 - pmaddubsw m14, m9 - pmaddubsw m3, m9 - pmaddubsw m15, m9 - punpckhbw m24, m6, m18 - punpcklbw m6, m18 - paddw m20, m2 - paddw m21, m14 - paddw m22, m3 - paddw m23, m15 - mova m2, m4 - mova m14, m16 - mova m3, m5 - mova m15, m17 - pmaddubsw m4, m10 - pmaddubsw m16, m10 - pmaddubsw m5, m10 - pmaddubsw m17, m10 - punpckhbw m25, m18, m19 - punpcklbw m18, m19 - paddw m20, m4 - paddw m21, m16 - paddw m22, m5 - paddw m23, m17 - mova m4, m6 - mova m16, m24 - mova m5, m18 - mova m17, m25 - pmaddubsw m6, m11 - pmaddubsw m24, m11 - pmaddubsw m18, m11 - pmaddubsw m25, m11 - paddw m20, m6 - paddw m21, m24 - paddw m22, m18 - paddw m23, m25 - pmulhrsw m20, m7 - pmulhrsw m21, m7 - pmulhrsw m22, m7 - pmulhrsw m23, m7 - mova m6, m19 - mova [tmpq+wq*0+ 0], m20 - mova [tmpq+wq*0+64], m21 - mova [tmpq+wq*2+ 0], m22 - mova 
[tmpq+wq*2+64], m23 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .v_loop - add r5, 64 - add r7, 128 - movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 - sub r6d, 1<<8 - jg .v_loop0 + jg .h_loop0 RET .hv: - WIN64_SPILL_XMM 16 - cmp wd, 4 - je .hv_w4 - shr mxd, 16 - sub srcq, 3 - vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd - tzcnt wd, wd + RESET_STACK_STATE vpbroadcastd m8, [pd_2] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] vpbroadcastd m9, [pd_32] - add wq, r7 - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] - lea stride3q, [strideq*3] - sub srcq, stride3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 - jmp wq -.hv_w4: + cmp wd, 4 + jg .hv_w8 movzx mxd, mxb dec srcq - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + vpbroadcastd m11, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + vpbroadcastq m0, [base+subpel_filters+myq*8] lea stride3q, [strideq*3] sub srcq, stride3q mov r3d, 0x04 kmovb k1, r3d kshiftlb k2, k1, 2 kshiftlb k3, k1, 4 - vpbroadcastd m10, [pd_2] - vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend - vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 @@ -2910,263 +3897,265 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vpbroadcastq m3{k2}, [srcq+strideq*0] vpbroadcastq m2{k3}, [srcq+strideq*1] vpbroadcastq m3{k3}, [srcq+strideq*2] - mova m17, [spel_hv_perm4a] - movu m18, [spel_hv_perm4b] - mova m0, m10 - mova m1, m10 - pshufb m2, m16 - pshufb m3, m16 - vpdpbusd m0, m2, m8 - vpdpbusd m1, m3, m8 + mova m6, [spel_hv_perm4a] + movu m7, [spel_hv_perm4b] + mova m0, m8 + 
mova m1, m8 + pshufb m2, m10 + pshufb m3, m10 + vpdpbusd m0, m2, m11 + vpdpbusd m1, m3, m11 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 psraw m0, 2 - vpermb m1, m17, m0 ; 01 12 23 34 - vpermb m2, m18, m0 ; 23 34 45 56 + vpermb m1, m6, m0 ; 01 12 23 34 + vpermb m2, m7, m0 ; 23 34 45 56 .hv_w4_loop: movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movq xm4, [srcq+strideq*0] vpbroadcastq ym3{k1}, [srcq+strideq*1] vpbroadcastq ym4{k1}, [srcq+strideq*2] - mova ym5, ym10 - mova ym6, ym10 - pshufb ym3, ym16 - pshufb ym4, ym16 - vpdpbusd ym5, ym3, ym8 - vpdpbusd ym6, ym4, ym8 - mova m7, m11 - packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ - psraw ym5, 2 - valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a - vpdpwssd m7, m1, m12 - vpdpwssd m7, m2, m13 - vpermb m1, m17, m0 ; 45 56 67 78 - vpermb m2, m18, m0 ; 67 78 89 9a - vpdpwssd m7, m1, m14 - vpdpwssd m7, m2, m15 - psrad m7, 6 - vpmovdw [tmpq], m7 + mova m5, m9 + pshufb ym3, ym10 + vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova ym1, ym8 + pshufb ym4, ym10 + vpdpbusd ym1, ym3, ym11 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + vpdpwssd m5, m2, m13 ; a1 b1 c1 d1 + packssdw ym1, ym3 ; 7 8 9 a + psraw ym1, 2 + vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a + vpermb m1, m6, m0 ; 45 56 67 78 + vpermb m2, m7, m0 ; 67 78 89 9a + vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m5, m2, m15 ; a3 b3 c3 d3 + psrad m5, 6 + vpmovdw [tmpq], m5 add tmpq, 32 sub hd, 4 jg .hv_w4_loop - vzeroupper RET .hv_w8: - WIN64_SPILL_XMM 24 - vbroadcasti128 m16, [subpel_h_shufA] - vbroadcasti128 m17, [subpel_h_shufB] - vbroadcasti128 m18, [subpel_h_shufC] - vinserti128 ym0, [srcq+strideq*0], 1 - vinserti128 m0, [srcq+strideq*1], 2 - vinserti128 m0, [srcq+strideq*2], 3 - movu xm1, [srcq+stride3q ] + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, 
m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + cmp wd, 8 + jg .hv_w16 + vbroadcasti32x4 m17, [srcq+stride3q ] + vinserti32x4 m16, m17, [srcq+strideq*0], 0 + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti32x4 m16, [srcq+strideq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+strideq*2], 2 lea srcq, [srcq+strideq*4] - vinserti128 ym1, [srcq+strideq*0], 1 - vinserti128 m1, [srcq+strideq*1], 2 - vinserti128 m1, [srcq+strideq*2], 3 + vinserti128 ym17, [srcq+strideq*0], 1 + vbroadcasti32x4 m20, [subpel_h_shufB] + vinserti32x4 m17, [srcq+strideq*1], 2 + vinserti32x4 m17, [srcq+strideq*2], 3 + pshufb m3, m16, m19 ; 0 1 2 3 0123 mova m2, m8 - mova m4, m8 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 mova m3, m8 - mova m5, m8 - pshufb m20, m0, m16 - pshufb m21, m0, m17 - pshufb m22, m0, m18 - pshufb m23, m1, m16 - pshufb m6, m1, m17 - pshufb m7, m1, m18 - vpdpbusd m2, m20, m10 - vpdpbusd m4, m21, m10 - vpdpbusd m2, m21, m11 - vpdpbusd m4, m22, m11 - vpdpbusd m3, m23, m10 - vpdpbusd m5, m6, m10 - vpdpbusd m3, m6, m11 - vpdpbusd m5, m7, m11 - packssdw m2, m4 - packssdw m3, m5 - psraw m2, 2 ; _ 0 1 2 - psraw m3, 2 ; 3 4 5 6 - valignq m0, m3, m2, 2 ; 0 1 2 3 - valignq m1, m3, m2, 4 ; 1 2 3 4 - valignq m2, m3, m2, 6 ; 2 3 4 5 - punpcklwd m4, m0, m1 ; 01a 12a 23a 34a - punpckhwd m5, m0, m1 ; 01b 12b 23b 34b - punpcklwd m6, m2, m3 ; 23a 34a 45a 56a - punpckhwd m7, m2, m3 ; 23b 34b 45b 56b + pshufb m1, m17, m19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova m0, m8 + pshufb m4, m17, m21 ; 3 4 5 6 89ab + vpdpbusd m0, m1, m10 + mova m1, m8 + pshufb m16, m20 ; 0 1 2 3 4567 + vpdpbusd m1, m4, m11 + pshufb m17, m20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd m0, m17, m11 + vpdpbusd m1, m17, m10 + packssdw m2, m3 + packssdw m0, m1 + psraw m2, 2 ; 0 1 2 3 + psraw m0, 2 ; 3 4 5 6 + vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5 + vshufi32x4 m5, m2, m0, 
q1021 ; 1 2 3 4 + punpcklwd m3, m4, m0 ; 23 34 45 56 + punpckhwd m4, m0 + punpcklwd m1, m2, m5 ; 01 12 23 34 + punpckhwd m2, m5 .hv_w8_loop: - movu xm19, [srcq+stride3q ] + movu xm18, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vinserti128 ym19, [srcq+strideq*0], 1 - vinserti128 m19, [srcq+strideq*1], 2 - vinserti128 m19, [srcq+strideq*2], 3 - mova m20, m9 - mova m21, m9 - mova m22, m8 - mova m23, m8 - vpdpwssd m20, m4, m12 - vpdpwssd m21, m5, m12 - vpdpwssd m20, m6, m13 - vpdpwssd m21, m7, m13 - pshufb m0, m19, m16 - pshufb m1, m19, m17 - pshufb m2, m19, m18 - vpdpbusd m22, m0, m10 - vpdpbusd m23, m1, m10 - vpdpbusd m22, m1, m11 - vpdpbusd m23, m2, m11 - packssdw m22, m23 - psraw m22, 2 ; 7 8 9 A - valignq m0, m22, m3, 2 ; 4 5 6 7 - valignq m1, m22, m3, 4 ; 5 6 7 8 - valignq m2, m22, m3, 6 ; 6 7 8 9 - mova m3, m22 - punpcklwd m4, m0, m1 ; 45a 56a 67a 78a - punpckhwd m5, m0, m1 ; 45b 56b 67b 78b - punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa - punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 - psrad m20, 6 - psrad m21, 6 - packssdw m20, m21 - mova [tmpq], m20 + vinserti128 ym18, [srcq+strideq*0], 1 + vinserti32x4 m18, [srcq+strideq*1], 2 + vinserti32x4 m18, [srcq+strideq*2], 3 + pshufb m17, m18, m19 ; 7 8 9 a 0123 + mova m16, m8 + pshufb m5, m18, m21 ; 7 8 9 a 89ab + vpdpbusd m16, m17, m10 + mova m17, m8 + pshufb m18, m20 ; 7 8 9 a 4567 + vpdpbusd m17, m5, m11 + mova m5, m9 + vpdpwssd m5, m3, m13 ; a1 b1 c1 d1 + mova m6, m9 + vpdpwssd m6, m4, m13 + vpdpbusd m16, m18, m11 + vpdpbusd m17, m18, m10 + vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova m1, m3 + vpdpwssd m6, m2, m12 + mova m2, m4 + packssdw m16, m17 + psraw m16, 2 ; 7 8 9 a + valignq m4, m16, m0, 6 ; 6 7 8 9 + mova m0, m16 + punpcklwd m3, m4, m16 ; 67 78 89 9a + punpckhwd m4, m16 + vpdpwssd m5, m3, m15 ; a3 b3 c3 d3 + vpdpwssd m6, m4, m15 + vshufi32x4 m1, m3, q1032 ; 45 56 67 78 + vshufi32x4 m2, m4, q1032 + vpdpwssd m5, m1, m14 ; a2 
b2 c2 d2 + vpdpwssd m6, m2, m14 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + mova [tmpq], m5 add tmpq, 64 sub hd, 4 jg .hv_w8_loop + vzeroupper RET .hv_w16: - mov wd, 16*2 - jmp .hv_start -.hv_w32: - mov wd, 32*2 - jmp .hv_start -.hv_w64: - mov wd, 64*2 - jmp .hv_start -.hv_w128: - mov wd, 128*2 -.hv_start: - WIN64_SPILL_XMM 31 - mova m16, [spel_h_perm16a] - mova m17, [spel_h_perm16b] - mova m18, [spel_h_perm16c] + WIN64_SPILL_XMM 23 + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 lea r6d, [hq+wq*8-256] - mov r5, srcq + paddb m18, m17 +.hv_w16_loop0: + movu ym19, [srcq+strideq*0] + vinserti32x8 m19, [srcq+strideq*1], 1 + lea r5, [srcq+strideq*2] + movu ym20, [r5 +strideq*0] + vinserti32x8 m20, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym21, [r5 +strideq*0] + vinserti32x8 m21, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym22, [r5 +strideq*0] mov r7, tmpq -.hv_loop0: - movu ym0, [srcq+strideq*0] - vinserti32x8 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym1, [srcq+strideq*0] - vinserti32x8 m1, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - vinserti32x8 m2, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym3, [srcq+strideq*0] + vpermb m3, m16, m19 ; 0 1 0123 89ab + mova m2, m8 + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 + mova m3, m8 + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 mova m4, m8 + vpermb m6, m18, m20 ; 2 3 89ab ghij + vpdpbusd m4, m5, m10 mova m5, m8 + vpermb m7, m16, m21 ; 4 5 0123 89ab + vpdpbusd m5, m6, m11 mova m6, m8 + vpermb m0, m18, m21 ; 4 5 89ab ghij + vpdpbusd m6, m7, m10 mova m7, m8 - vpermb m19, m16, m0 - vpermb m20, m17, m0 - vpermb m21, m18, m0 - vpermb m22, m16, m1 - vpermb m23, m17, m1 - vpermb m24, m18, m1 - vpermb m25, m16, m2 - vpermb m26, m17, m2 - vpermb m27, m18, m2 - vpermb ym28, ym16, ym3 - vpermb ym29, ym17, ym3 - vpermb ym30, ym18, ym3 - mova m0, m8 - mova m1, m8 - mova ym2, ym8 - 
mova ym3, ym8 - vpdpbusd m4, m19, m10 - vpdpbusd m5, m20, m10 - vpdpbusd m6, m22, m10 - vpdpbusd m7, m23, m10 - vpdpbusd m0, m25, m10 - vpdpbusd m1, m26, m10 - vpdpbusd ym2, ym28, ym10 - vpdpbusd ym3, ym29, ym10 + vpermb ym1, ym16, ym22 ; 6 0123 89ab + vpdpbusd m7, m0, m11 + mova ym0, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym0, ym1, ym10 + vpermb ym1, ym18, ym22 ; 6 89ab ghij + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + mova ym19, ym8 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd ym19, ym1, ym11 + vpermb m21, m17, m21 ; 4 5 4567 cdef vpdpbusd m4, m20, m11 - vpdpbusd m5, m21, m11 - vpdpbusd m6, m23, m11 - vpdpbusd m7, m24, m11 - vpdpbusd m0, m26, m11 - vpdpbusd m1, m27, m11 - vpdpbusd ym2, ym29, ym11 - vpdpbusd ym3, ym30, ym11 - packssdw m4, m5 - packssdw m6, m7 - packssdw m0, m1 - packssdw ym2, ym3 - psraw m4, 2 ; 0a 0b 1a 1b - psraw m6, 2 ; 2a 2b 3a 3b - psraw m0, 2 ; 4a 4b 5a 5b - psraw ym2, 2 ; 6a 6b __ __ - vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b - vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b - vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b - punpcklwd m2, m4, m5 ; 01a 01c 12a 12c - punpckhwd m3, m4, m5 ; 01b 01d 12b 12d - punpcklwd m4, m6, m7 ; 23a 23c 34a 34c - punpckhwd m5, m6, m7 ; 23b 23d 34b 34d - punpcklwd m6, m0, m1 ; 45a 45c 56a 56c - punpckhwd m7, m0, m1 ; 45b 45d 56b 56d -.hv_loop: - movu ym19, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti32x8 m19, [srcq+strideq*0], 1 + vpdpbusd m5, m20, m10 + vpermb ym22, ym17, ym22 ; 6 4567 cdef + vpdpbusd m6, m21, m11 + vpdpbusd m7, m21, m10 + packssdw m2, m3 ; 0 1 + vpdpbusd ym0, ym22, ym11 + packssdw m4, m5 ; 2 3 + vpdpbusd ym19, ym22, ym10 + packssdw m6, m7 ; 4 5 + packssdw ym0, ym19 ; 6 + REPX {psraw x, 2}, m2, m4, m6, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m5, m4, m6, q1032 ; 3 4 + vshufi32x4 m0, m6, m0, q1032 ; 5 6 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 ; 23 34 + punpckhwd m4, m5 + punpcklwd m5, m6, m0 ; 45 56 + punpckhwd m6, m0 
+.hv_w16_loop: + movu ym19, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti32x8 m19, [r5+strideq*0], 1 mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 + vpermb m1, m16, m19 mova m21, m9 + vpdpwssd m21, m2, m12 ; b0 + vpermb m2, m17, m19 mova m22, m8 - mova m23, m8 - vpdpwssd m20, m2, m12 - vpdpwssd m21, m3, m12 - vpdpwssd m20, m4, m13 - vpdpwssd m21, m5, m13 - vpermb m24, m16, m19 - vpermb m25, m17, m19 - vpermb m26, m18, m19 - vpdpbusd m22, m24, m10 - vpdpbusd m23, m25, m10 - vpdpbusd m22, m25, m11 - vpdpbusd m23, m26, m11 - packssdw m22, m23 - psraw m22, 2 ; 7a 7b 8a 8b - vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + vpdpbusd m22, m1, m10 + mova m1, m8 + vpermb m19, m18, m19 + vpdpbusd m1, m2, m10 + vpdpwssd m20, m3, m13 ; a1 + vpdpwssd m21, m4, m13 ; b1 + vpdpbusd m22, m2, m11 mova m2, m4 - mova m3, m5 - mova m1, m22 + vpdpbusd m1, m19, m11 mova m4, m6 - mova m5, m7 - punpcklwd m6, m0, m1 ; 67a 67c 78a 78c - punpckhwd m7, m0, m1 ; 67b 67d 78b 78d - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 + vpdpwssd m20, m5, m14 ; a2 + vpdpwssd m21, m6, m14 ; b2 + packssdw m22, m1 + mova m1, m3 + psraw m22, 2 ; 7 8 + mova m3, m5 + vshufi32x4 m6, m0, m22, q1032 ; 6 7 + mova m0, m22 + punpcklwd m5, m6, m0 ; 67 78 + punpckhwd m6, m0 + vpdpwssd m20, m5, m15 ; a3 + vpdpwssd m21, m6, m15 ; b3 psrad m20, 6 psrad m21, 6 packssdw m20, m21 - mova [tmpq+wq*0], ym20 - vextracti32x8 [tmpq+wq*1], m20, 1 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] sub hd, 2 - jg .hv_loop - add r5, 16 - add r7, 32 + jg .hv_w16_loop + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 - jg .hv_loop0 + jg .hv_w16_loop0 RET cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts diff --git a/third_party/dav1d/tests/meson.build b/third_party/dav1d/tests/meson.build index 11db0a56e9..38a591b5b4 100644 --- a/third_party/dav1d/tests/meson.build +++ b/third_party/dav1d/tests/meson.build 
@@ -100,7 +100,7 @@ if is_asm_enabled ], ) - test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false) + test('checkasm', checkasm, suite: 'checkasm', timeout: 180) benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench') endif -- cgit v1.2.3