summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:37 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:37 +0000
commita90a5cba08fdf6c0ceb95101c275108a152a3aed (patch)
tree532507288f3defd7f4dcf1af49698bcb76034855 /third_party/dav1d
parentAdding debian version 126.0.1-1. (diff)
downloadfirefox-a90a5cba08fdf6c0ceb95101c275108a152a3aed.tar.xz
firefox-a90a5cba08fdf6c0ceb95101c275108a152a3aed.zip
Merging upstream version 127.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d')
-rw-r--r--third_party/dav1d/meson.build2
-rw-r--r--third_party/dav1d/meson_options.txt5
-rw-r--r--third_party/dav1d/src/arm/64/mc.S4
-rw-r--r--third_party/dav1d/src/arm/64/mc_dotprod.S1413
-rw-r--r--third_party/dav1d/src/arm/64/msac.S21
-rw-r--r--third_party/dav1d/src/arm/itx.h63
-rw-r--r--third_party/dav1d/src/arm/mc.h85
-rw-r--r--third_party/dav1d/src/cdf.c1378
-rw-r--r--third_party/dav1d/src/cdf.h48
-rw-r--r--third_party/dav1d/src/decode.c95
-rw-r--r--third_party/dav1d/src/internal.h9
-rw-r--r--third_party/dav1d/src/itx.h63
-rw-r--r--third_party/dav1d/src/lf_mask.c6
-rw-r--r--third_party/dav1d/src/meson.build1
-rw-r--r--third_party/dav1d/src/refmvs.c4
-rw-r--r--third_party/dav1d/src/riscv/itx.h63
-rw-r--r--third_party/dav1d/src/x86/ipred_avx2.asm3
-rw-r--r--third_party/dav1d/src/x86/itx.h64
-rw-r--r--third_party/dav1d/src/x86/mc16_avx2.asm1378
-rw-r--r--third_party/dav1d/src/x86/mc_avx2.asm1471
-rw-r--r--third_party/dav1d/src/x86/mc_avx512.asm2953
-rw-r--r--third_party/dav1d/tests/meson.build2
22 files changed, 6606 insertions, 2525 deletions
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index e371415d53..a2637ed797 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -81,6 +81,8 @@ cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or
# Logging option
cdata.set10('CONFIG_LOG', get_option('logging'))
+cdata.set10('CONFIG_MACOS_KPERF', get_option('macos_kperf'))
+
#
# OS/Compiler checks and defines
#
diff --git a/third_party/dav1d/meson_options.txt b/third_party/dav1d/meson_options.txt
index c04deffd73..b0b45b474d 100644
--- a/third_party/dav1d/meson_options.txt
+++ b/third_party/dav1d/meson_options.txt
@@ -68,3 +68,8 @@ option('trim_dsp',
choices: ['true', 'false', 'if-release'],
value: 'if-release',
description: 'Eliminate redundant DSP functions where possible')
+
+option('macos_kperf',
+ type: 'boolean',
+ value: false,
+ description: 'Use the private macOS kperf API for benchmarking')
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
index 3df0393c3a..5b493be82d 100644
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -837,7 +837,7 @@ endfunc
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
-function put_neon
+function put_neon, export=1
adr x9, L(put_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@@ -939,7 +939,7 @@ endfunc
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
-function prep_neon
+function prep_neon, export=1
adr x9, L(prep_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
diff --git a/third_party/dav1d/src/arm/64/mc_dotprod.S b/third_party/dav1d/src/arm/64/mc_dotprod.S
new file mode 100644
index 0000000000..fcf04ee4d0
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc_dotprod.S
@@ -0,0 +1,1413 @@
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * Copyright © 2024, Janne Grunau
+ * Copyright © 2024, Martin Storsjo
+ * Copyright © 2024, Arm Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+
+#if HAVE_DOTPROD
+ENABLE_DOTPROD
+
+// No spaces in these expressions, due to gas-preprocessor. Each index is biased
+// by -1 so that no negative offset is needed when addressing `mc_subpel_filters`.
+#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
+#define SMOOTH1 (((1*15-1)<<7)|(4*15-1))
+#define SHARP1 (((2*15-1)<<7)|(3*15-1))
+
+#define FUNC_ALIGN 2
+#define JUMP_ALIGN 2
+#define LOOP_ALIGN 2
+
+
+// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
+ .align 4
+L(hv_tbl_neon_dotprod):
+ .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+
+// Shuffle indices to permute horizontal samples in preparation for input to
+// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
+// interval of [-3, 4] relative to the current sample position. We load samples
+// from index value -4 to keep loads word aligned, so the shuffle bytes are
+// translated by 1 to handle this.
+ .align 4
+L(h_tbl_neon_dotprod):
+ .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7
+ .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11
+ .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15
+ .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19
+
+// Vertical convolutions are also using SDOT instructions, where a 128-bit
+// register contains a transposed 4x4 matrix of values. Subsequent iterations of
+// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
+// iteration. These shuffle indices shift and merge this 4x4 matrix with the
+// values of a new line.
+ .align 4
+L(v_tbl_neon_dotprod):
+ .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
+ .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
+ .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
+ .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
+ .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
+
+
+.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
+function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
+ mov x9, \type_h // x9 = packed horizontal filter-type constant (REGULAR1, SMOOTH1 or SHARP1)
+ mov x10, \type_v // x10 = packed vertical filter-type constant
+ .if \jump
+ b \op\()_8tap_\isa // hand over to the shared 8-tap body of this op
+ .endif // jump=0 emits no branch; presumably execution falls through into the code placed right after
+endfunc
+.endm
+
+.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
+make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa
+make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa
+make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa
+make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa
+make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa
+make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa
+make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa
+make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa
+make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0
+
+function \type\()_8tap_\isa, align=FUNC_ALIGN
+ clz w8, \w // w8 = number of leading zero bits of the block width
+ mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0): multiplier that copies a 7-bit value into three 7-bit fields
+ sub w8, w8, #24 // for jump tables (put_neon and prep_neon also expect x8 = clz(w) - 24)
+ movrel x12, X(mc_subpel_filters) // x12 = base used below to form the subpel filter addresses
+ cbnz \mx, L(\type\()_8tap_h_hv_\isa) // mx != 0: horizontal-only or 2-D filtering
+ cbnz \my, L(\type\()_8tap_v_\isa) // mx == 0 and my != 0: vertical-only filtering
+.ifc \type, prep
+ add \wd_strd, \w, \w // prep_neon needs w * 2 as stride
+.endif
+ b X(\type\()_neon) // mx == 0 and my == 0: no subpel filtering, reuse the plain put/prep routine
+
+ .align JUMP_ALIGN
+L(\type\()_8tap_v_\isa):
+ madd \my, \my, w11, w10 // my * 0x4081 + packed vertical type: each 7-bit field becomes base index + my
+.ifc \type, prep
+ mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding
+.endif
+ sub \src, \src, \s_strd // step back one row
+ ldr q6, L(v_tbl_neon_dotprod) // first 16 shuffle bytes of the vertical table
+.ifc \type, prep
+ dup v4.4s, w8 // v4 = accumulator start value in every 32-bit lane
+.endif
+ ubfx w11, \my, #7, #7 // w11 = field in bits 7..13
+ and \my, \my, #0x7F // my = field in bits 0..6
+ ldr q28, L(v_tbl_neon_dotprod) + 16
+ cmp \h, #4
+ csel \my, \my, w11, le // h <= 4 keeps the low field, taller blocks take the upper one
+ sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3
+ ldr q29, L(v_tbl_neon_dotprod) + 32
+ add \xmy, x12, \xmy, lsl #3 // subpel V filter address
+ movi v5.16b, #128 // v5 = 128 in every byte, subtracted from the samples before SDOT
+ ldr d7, [\xmy] // d7 = the 8 one-byte vertical filter taps
+ cmp \w, #8
+ b.eq 80f // w == 8
+ b.lt 40f // w < 8
+
+ // .align JUMP_ALIGN // fallthrough
+160: // V - 16xN+
+ ldr q30, L(v_tbl_neon_dotprod) + 48
+ ldr q31, L(v_tbl_neon_dotprod) + 64
+.ifc \type, prep
+ add \wd_strd, \w, \w
+.endif
+ .align LOOP_ALIGN
+161:
+ mov \lsrc, \src
+ mov \ldst, \dst
+ sub w8, \h, #1
+
+ ldr q16, [\lsrc]
+ ldr q17, [\lsrc, \s_strd]
+ add \lsrc, \lsrc, \s_strd, lsl #1
+ ldr q18, [\lsrc]
+ ldr q19, [\lsrc, \s_strd]
+ add \lsrc, \lsrc, \s_strd, lsl #1
+
+ zip1 v0.16b, v16.16b, v17.16b
+ zip2 v1.16b, v16.16b, v17.16b
+ zip1 v2.16b, v18.16b, v19.16b
+ zip2 v3.16b, v18.16b, v19.16b
+
+ ldr q20, [\lsrc]
+ ldr q21, [\lsrc, \s_strd]
+ add \lsrc, \lsrc, \s_strd, lsl #1
+ ldr q22, [\lsrc]
+ ldr q23, [\lsrc, \s_strd]
+ add \lsrc, \lsrc, \s_strd, lsl #1
+
+ zip1 v18.16b, v20.16b, v21.16b
+ zip2 v21.16b, v20.16b, v21.16b
+ zip1 v24.16b, v22.16b, v23.16b
+ zip2 v27.16b, v22.16b, v23.16b
+
+ zip1 v16.8h, v0.8h, v2.8h
+ zip2 v19.8h, v0.8h, v2.8h
+ zip1 v22.8h, v1.8h, v3.8h
+ zip2 v25.8h, v1.8h, v3.8h
+
+ zip1 v17.8h, v18.8h, v24.8h
+ zip2 v20.8h, v18.8h, v24.8h
+ zip1 v23.8h, v21.8h, v27.8h
+ zip2 v26.8h, v21.8h, v27.8h
+
+ sub v16.16b, v16.16b, v5.16b
+ sub v19.16b, v19.16b, v5.16b
+ sub v22.16b, v22.16b, v5.16b
+ sub v25.16b, v25.16b, v5.16b
+
+ sub v17.16b, v17.16b, v5.16b
+ sub v20.16b, v20.16b, v5.16b
+ sub v23.16b, v23.16b, v5.16b
+ sub v26.16b, v26.16b, v5.16b
+
+ .align LOOP_ALIGN
+16:
+ ldr q27, [\lsrc]
+ add \lsrc, \lsrc, \s_strd
+.ifc \type, prep
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v4.16b
+.else
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+ movi v2.4s, #32, lsl 8
+ movi v3.4s, #32, lsl 8
+.endif
+ sub v18.16b, v27.16b, v5.16b
+ sub v21.16b, v27.16b, v5.16b
+ sub v24.16b, v27.16b, v5.16b
+ sub v27.16b, v27.16b, v5.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v2.4s, v22.16b, v7.4b[0]
+ sdot v3.4s, v25.16b, v7.4b[0]
+
+ tbl v16.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v19.16b, {v19.16b, v20.16b}, v6.16b
+ tbl v22.16b, {v22.16b, v23.16b}, v6.16b
+ tbl v25.16b, {v25.16b, v26.16b}, v6.16b
+
+ sdot v0.4s, v17.16b, v7.4b[1]
+ sdot v1.4s, v20.16b, v7.4b[1]
+ sdot v2.4s, v23.16b, v7.4b[1]
+ sdot v3.4s, v26.16b, v7.4b[1]
+
+ tbl v17.16b, {v17.16b, v18.16b}, v28.16b
+ tbl v20.16b, {v20.16b, v21.16b}, v29.16b
+ tbl v23.16b, {v23.16b, v24.16b}, v30.16b
+ tbl v26.16b, {v26.16b, v27.16b}, v31.16b
+
+ subs w8, w8, #1
+ uzp1 v0.8h, v0.8h, v1.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+.ifc \type, prep
+ sshr v0.8h, v0.8h, #2
+ sshr v1.8h, v2.8h, #2
+ st1 {v0.8h, v1.8h}, [\ldst], \d_strd
+.else
+ sqrshrun v0.8b, v0.8h, #6
+ sqrshrun2 v0.16b, v2.8h, #6
+ st1 {v0.16b}, [\ldst], \d_strd
+.endif
+ b.gt 16b
+
+.ifc \type, prep
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v4.16b
+.else
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+ movi v2.4s, #32, lsl 8
+ movi v3.4s, #32, lsl 8
+.endif
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v2.4s, v22.16b, v7.4b[0]
+ sdot v3.4s, v25.16b, v7.4b[0]
+
+ sdot v0.4s, v17.16b, v7.4b[1]
+ sdot v1.4s, v20.16b, v7.4b[1]
+ sdot v2.4s, v23.16b, v7.4b[1]
+ sdot v3.4s, v26.16b, v7.4b[1]
+
+ subs \w, \w, #16
+ uzp1 v0.8h, v0.8h, v1.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+.ifc \type, prep
+ sshr v0.8h, v0.8h, #2
+ sshr v1.8h, v2.8h, #2
+ stp q0, q1, [\ldst]
+ add \dst, \dst, #32
+.else
+ sqrshrun v0.8b, v0.8h, #6
+ sqrshrun2 v0.16b, v2.8h, #6
+ str q0, [\ldst]
+ add \dst, \dst, #16
+.endif
+ add \src, \src, #16
+ b.gt 161b
+ ret
+
+ .align JUMP_ALIGN
+80: // V - 8xN
+ ldr d16, [\src]
+ ldr d17, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr d18, [\src]
+ ldr d19, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+
+ ldr d20, [\src]
+ ldr d21, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr d22, [\src]
+ ldr d23, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ subs \h, \h, #2 // for prep: sub is enough
+
+ zip1 v0.16b, v16.16b, v17.16b
+ zip1 v2.16b, v18.16b, v19.16b
+ zip1 v18.16b, v20.16b, v21.16b
+ zip1 v24.16b, v22.16b, v23.16b
+
+ zip1 v16.8h, v0.8h, v2.8h
+ zip2 v19.8h, v0.8h, v2.8h
+ zip1 v17.8h, v18.8h, v24.8h
+ zip2 v20.8h, v18.8h, v24.8h
+
+ sub v16.16b, v16.16b, v5.16b
+ sub v19.16b, v19.16b, v5.16b
+ sub v17.16b, v17.16b, v5.16b
+ sub v20.16b, v20.16b, v5.16b
+.ifc \type, put
+ b.eq 82f
+.endif
+
+ .align LOOP_ALIGN
+8:
+ ldr d21, [\src]
+ ldr d27, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+.ifc \type, prep
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v4.16b
+.else
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+ movi v2.4s, #32, lsl 8
+ movi v3.4s, #32, lsl 8
+.endif
+ sub v18.16b, v21.16b, v5.16b
+ sub v21.16b, v21.16b, v5.16b
+ sub v24.16b, v27.16b, v5.16b
+ sub v27.16b, v27.16b, v5.16b
+
+ tbl v22.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v25.16b, {v19.16b, v20.16b}, v6.16b
+ tbl v23.16b, {v17.16b, v18.16b}, v28.16b
+ tbl v26.16b, {v20.16b, v21.16b}, v29.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+
+ tbl v16.16b, {v22.16b, v23.16b}, v6.16b
+ tbl v19.16b, {v25.16b, v26.16b}, v6.16b
+ tbl v17.16b, {v23.16b, v24.16b}, v28.16b
+ tbl v20.16b, {v26.16b, v27.16b}, v29.16b
+
+ sdot v2.4s, v22.16b, v7.4b[0]
+ sdot v2.4s, v23.16b, v7.4b[1]
+ sdot v3.4s, v25.16b, v7.4b[0]
+ sdot v3.4s, v26.16b, v7.4b[1]
+
+ subs \h, \h, #2
+ uzp1 v0.8h, v0.8h, v1.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+.ifc \type, prep
+ sshr v0.8h, v0.8h, #2
+ sshr v1.8h, v2.8h, #2
+ stp q0, q1, [\dst], #32
+.else
+ sqrshrun v0.8b, v0.8h, #6
+ sqrshrun v1.8b, v2.8h, #6
+ str d0, [\dst]
+ str d1, [\dst, \d_strd]
+ add \dst, \dst, \d_strd, lsl #1
+.endif
+ b.gt 8b
+
+.ifc \type, put
+ .align JUMP_ALIGN
+82:
+ ldr d21, [\src]
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+ movi v2.4s, #32, lsl 8
+ movi v3.4s, #32, lsl 8
+.else
+ ldr d21, [\src]
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v4.16b
+.endif
+ sub v18.16b, v21.16b, v5.16b
+ sub v21.16b, v21.16b, v5.16b
+
+ tbl v22.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v25.16b, {v19.16b, v20.16b}, v6.16b
+ tbl v23.16b, {v17.16b, v18.16b}, v28.16b
+ tbl v26.16b, {v20.16b, v21.16b}, v29.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+
+ sdot v2.4s, v22.16b, v7.4b[0]
+ sdot v2.4s, v23.16b, v7.4b[1]
+ sdot v3.4s, v25.16b, v7.4b[0]
+ sdot v3.4s, v26.16b, v7.4b[1]
+
+ uzp1 v0.8h, v0.8h, v1.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+.ifc \type, prep
+ sshr v0.8h, v0.8h, #2
+ sshr v1.8h, v2.8h, #2
+ stp q0, q1, [\dst]
+.else
+ sqrshrun v0.8b, v0.8h, #6
+ sqrshrun v1.8b, v2.8h, #6
+ str d0, [\dst]
+ str d1, [\dst, \d_strd]
+.endif
+ ret
+
+ .align JUMP_ALIGN
+40: // V - 4xN or 2xN (put only)
+.ifc \type, put
+ cmp \w, #2
+ b.eq 20f
+.endif
+ ldr s16, [\src]
+ ldr s17, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr s18, [\src]
+ ldr s19, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+
+ ldr s20, [\src]
+ ldr s21, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr s22, [\src]
+ ldr s23, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ subs \h, \h, #2 // for prep: sub is enough
+
+ zip1 v0.8b, v16.8b, v17.8b
+ zip1 v2.8b, v18.8b, v19.8b
+ zip1 v18.8b, v20.8b, v21.8b
+ zip1 v24.8b, v22.8b, v23.8b
+
+ zip1 v16.8h, v0.8h, v2.8h
+ zip1 v17.8h, v18.8h, v24.8h
+
+ sub v16.16b, v16.16b, v5.16b
+ sub v17.16b, v17.16b, v5.16b
+.ifc \type, put
+ b.eq 42f
+.endif
+
+ .align LOOP_ALIGN
+4:
+ ldr s18, [\src]
+ ldr s21, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+.ifc \type, prep
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+.else
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+.endif
+ sub v18.16b, v18.16b, v5.16b
+ sub v21.16b, v21.16b, v5.16b
+
+ tbl v19.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v20.16b, {v17.16b, v18.16b}, v28.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+
+ tbl v16.16b, {v19.16b, v20.16b}, v6.16b
+ tbl v17.16b, {v20.16b, v21.16b}, v28.16b
+
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+.ifc \type, prep
+ subs \h, \h, #2
+ shrn v0.4h, v0.4s, #2
+ shrn2 v0.8h, v1.4s, #2
+ str q0, [\dst], #16
+.else
+ uzp1 v0.8h, v0.8h, v1.8h
+ sqrshrun v0.8b, v0.8h, #6
+ subs \h, \h, #2
+ fmov x8, d0
+ lsr x9, x8, #32
+ str w8, [\dst]
+ str w9, [\dst, \d_strd]
+ add \dst, \dst, \d_strd, lsl #1
+.endif
+ b.gt 4b
+
+.ifc \type, put
+ .align JUMP_ALIGN
+42:
+ ldr s18, [\src]
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+.else
+ ldr s18, [\src]
+ mov v0.16b, v4.16b
+ mov v1.16b, v4.16b
+.endif
+ sub v18.16b, v18.16b, v5.16b
+
+ tbl v19.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v20.16b, {v17.16b, v18.16b}, v28.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+.ifc \type, prep
+ shrn v0.4h, v0.4s, #2
+ shrn2 v0.8h, v1.4s, #2
+ str q0, [\dst]
+ ret
+.else
+ uzp1 v0.8h, v0.8h, v1.8h
+ sqrshrun v0.8b, v0.8h, #6
+ fmov x8, d0
+ lsr x9, x8, #32
+ str w8, [\dst]
+ str w9, [\dst, \d_strd]
+ ret
+
+ .align JUMP_ALIGN
+20: // V - 2xN
+ ldr h16, [\src]
+ ldr h17, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr h18, [\src]
+ ldr h19, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+
+ ldr h20, [\src]
+ ldr h21, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ ldr h22, [\src]
+ ldr h23, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ subs \h, \h, #2
+
+ zip1 v0.8b, v16.8b, v17.8b
+ zip1 v2.8b, v18.8b, v19.8b
+ zip1 v18.8b, v20.8b, v21.8b
+ zip1 v24.8b, v22.8b, v23.8b
+
+ zip1 v16.4h, v0.4h, v2.4h
+ zip1 v17.4h, v18.4h, v24.4h
+
+ sub v16.8b, v16.8b, v5.8b
+ sub v17.8b, v17.8b, v5.8b
+
+ b.eq 22f
+
+ .align LOOP_ALIGN
+2:
+ ldr h18, [\src]
+ ldr h21, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+
+ sub v18.8b, v18.8b, v5.8b
+ sub v21.8b, v21.8b, v5.8b
+
+ tbl v19.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v20.16b, {v17.16b, v18.16b}, v28.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+
+ tbl v16.16b, {v19.16b, v20.16b}, v6.16b
+ tbl v17.16b, {v20.16b, v21.16b}, v28.16b
+
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+
+ uzp1 v0.8h, v0.8h, v1.8h
+ sqrshrun v0.8b, v0.8h, #6
+
+ subs \h, \h, #2
+ fmov x8, d0
+ lsr x9, x8, #32
+ strh w8, [\dst]
+ strh w9, [\dst, \d_strd]
+ add \dst, \dst, \d_strd, lsl #1
+ b.gt 2b
+
+ .align JUMP_ALIGN
+22:
+ ldr h18, [\src]
+
+ movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT
+ movi v1.4s, #32, lsl 8
+
+ sub v18.8b, v18.8b, v5.8b
+
+ tbl v19.16b, {v16.16b, v17.16b}, v6.16b
+ tbl v20.16b, {v17.16b, v18.16b}, v28.16b
+
+ sdot v0.4s, v16.16b, v7.4b[0]
+ sdot v0.4s, v17.16b, v7.4b[1]
+
+ sdot v1.4s, v19.16b, v7.4b[0]
+ sdot v1.4s, v20.16b, v7.4b[1]
+
+ uzp1 v0.8h, v0.8h, v1.8h
+ sqrshrun v0.8b, v0.8h, #6
+
+ fmov x8, d0
+ lsr x9, x8, #32
+ strh w8, [\dst]
+ strh w9, [\dst, \d_strd]
+ ret
+.endif
+
+ .align JUMP_ALIGN
+L(\type\()_8tap_h_hv_\isa):
+ madd \mx, \mx, w11, w9 // mx * 0x4081 + packed horizontal type (x9 is set by the entry stubs)
+ madd w14, \my, w11, w10 // for HV
+ ldr q28, L(h_tbl_neon_dotprod) // first 16 shuffle bytes of the horizontal table
+ mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
+ sub \src, \src, #4 // src - 4
+ dup v27.4s, w13 // v27 = accumulator start value in every 32-bit lane
+ ubfx w9, \mx, #7, #7 // w9 = field in bits 7..13
+ and \mx, \mx, #0x7F // mx = field in bits 0..6
+ ubfx w11, w14, #7, #7 // for HV
+ and w14, w14, #0x7F // for HV
+ cmp \w, #4
+ csel \mx, \mx, w9, le // w <= 4 keeps the low field, wider blocks take the upper one
+ add \xmx, x12, \xmx, lsl #3 // subpel H filter address
+ movi v24.16b, #128 // v24 = 128 in every byte, subtracted from the samples before SDOT
+ cbz \my, L(\type\()_8tap_h_\isa) // my == 0: horizontal-only path
+
+ // HV cases
+ cmp \h, #4
+ csel w14, w14, w11, le // h <= 4 keeps the low field, taller blocks take the upper one
+ sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4
+ add \xmy, x12, x14, lsl #3 // subpel V filter address
+ mov x15, x30 // keep the return address in x15; the bl calls in the HV kernels overwrite x30
+ ldr d7, [\xmy] // d7 = the 8 one-byte vertical filter taps
+.ifc \type, put
+ ldr q25, L(hv_tbl_neon_dotprod) // byte-select table used by the put kernels before the final narrowing
+.endif
+ sxtl v7.8h, v7.8b // widen the vertical taps to 16 bit for smull/smlal
+ cmp w10, SHARP1
+ b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
+
+ // HV 8-tap cases
+ sub \src, \src, \s_strd // src - src_stride * 3 - 4
+ cmp \w, #4
+ b.eq 40f
+.ifc \type, put
+ b.lt 20f
+.endif
+
+ // .align JUMP_ALIGN // fallthrough
+80: // HV8 - 8xN+
+ ldr q29, L(h_tbl_neon_dotprod) + 16
+ ldr q30, L(h_tbl_neon_dotprod) + 32
+ ldr d26, [\xmx]
+.ifc \type, prep
+ add \wd_strd, \w, \w
+.endif
+
+ .align LOOP_ALIGN
+81:
+ mov \lsrc, \src
+ mov \ldst, \dst
+ mov w8, \h
+
+ bl L(\type\()_hv_filter8_\isa)
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v20.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v21.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+
+ .align LOOP_ALIGN
+8:
+ ldr q23, [\lsrc]
+ add \lsrc, \lsrc, \s_strd
+
+ smull v0.4s, v16.4h, v7.h[0]
+ smull2 v1.4s, v16.8h, v7.h[0]
+ mov v16.16b, v17.16b
+
+ sub v23.16b, v23.16b, v24.16b
+
+ mov v5.16b, v27.16b
+ mov v6.16b, v27.16b
+
+ smlal v0.4s, v17.4h, v7.h[1]
+ smlal2 v1.4s, v17.8h, v7.h[1]
+ mov v17.16b, v18.16b
+
+ tbl v2.16b, {v23.16b}, v28.16b
+ tbl v3.16b, {v23.16b}, v29.16b
+ tbl v4.16b, {v23.16b}, v30.16b
+
+ smlal v0.4s, v18.4h, v7.h[2]
+ smlal2 v1.4s, v18.8h, v7.h[2]
+ mov v18.16b, v19.16b
+
+ sdot v5.4s, v2.16b, v26.4b[0]
+ sdot v6.4s, v3.16b, v26.4b[0]
+
+ smlal v0.4s, v19.4h, v7.h[3]
+ smlal2 v1.4s, v19.8h, v7.h[3]
+ mov v19.16b, v20.16b
+
+ sdot v5.4s, v3.16b, v26.4b[1]
+ sdot v6.4s, v4.16b, v26.4b[1]
+
+ smlal v0.4s, v20.4h, v7.h[4]
+ smlal2 v1.4s, v20.8h, v7.h[4]
+ mov v20.16b, v21.16b
+
+ smlal v0.4s, v21.4h, v7.h[5]
+ smlal2 v1.4s, v21.8h, v7.h[5]
+.ifc \type, prep
+ uzp1 v23.8h, v5.8h, v6.8h
+.endif
+ mov v21.16b, v22.16b
+
+ smlal v0.4s, v22.4h, v7.h[6]
+ smlal2 v1.4s, v22.8h, v7.h[6]
+.ifc \type, prep
+ sshr v22.8h, v23.8h, #2
+ smlal v0.4s, v22.4h, v7.h[7]
+ smlal2 v1.4s, v22.8h, v7.h[7]
+ rshrn v0.4h, v0.4s, #6
+ rshrn2 v0.8h, v1.4s, #6
+ subs w8, w8, #1
+ st1 {v0.8h}, [\ldst], \d_strd
+ b.gt 8b
+ add \dst, \dst, #16
+.else
+ shrn v22.4h, v5.4s, #2
+ shrn2 v22.8h, v6.4s, #2
+ smlal v0.4s, v22.4h, v7.h[7]
+ smlal2 v1.4s, v22.8h, v7.h[7]
+ tbl v0.16b, {v0.16b, v1.16b}, v25.16b
+ subs w8, w8, #1
+ sqrshrun v0.8b, v0.8h, #2
+ st1 {v0.8b}, [\ldst], \d_strd
+ b.gt 8b
+ add \dst, \dst, #8
+.endif
+ add \src, \src, #8
+ subs \w, \w, #8
+ b.gt 81b
+ ret x15
+
+ .align JUMP_ALIGN
+40: // HV8 - 4xN
+ ldr s26, [\xmx, #2]
+ add \src, \src, #2
+
+ bl L(\type\()_hv_filter4_\isa)
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v20.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v21.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+
+ .align LOOP_ALIGN
+4:
+ ld1 {v4.8b}, [\src], \s_strd
+
+ smull v0.4s, v16.4h, v7.h[0]
+ smlal v0.4s, v17.4h, v7.h[1]
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ sub v4.16b, v4.16b, v24.16b
+
+ smlal v0.4s, v18.4h, v7.h[2]
+ smlal v0.4s, v19.4h, v7.h[3]
+ tbl v2.16b, {v4.16b}, v28.16b
+ mov v5.16b, v27.16b
+
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+
+ smlal v0.4s, v20.4h, v7.h[4]
+ smlal v0.4s, v21.4h, v7.h[5]
+
+ sdot v5.4s, v2.16b, v26.4b[0]
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+.ifc \type, put
+ subs \h, \h, #1
+.endif
+ smlal v0.4s, v22.4h, v7.h[6]
+ shrn v22.4h, v5.4s, #2
+
+ smlal v0.4s, v22.4h, v7.h[7]
+.ifc \type, prep
+ rshrn v0.4h, v0.4s, #6
+ str d0, [\dst], #8
+ subs \h, \h, #1
+.else
+ tbl v0.16b, {v0.16b, v1.16b}, v25.16b
+ sqrshrun v0.8b, v0.8h, #2
+ str s0, [\dst]
+ add \dst, \dst, \d_strd
+.endif
+ b.gt 4b
+ ret x15
+
+.ifc \type, put
+ .align JUMP_ALIGN
+20: // HV8 - 2xN
+ ldr s26, [\xmx, #2]
+ add \src, \src, #2
+
+ bl L(\type\()_hv_filter4_\isa)
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v20.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v21.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+
+ .align LOOP_ALIGN
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+
+ smull v0.4s, v16.4h, v7.h[0]
+ smlal v0.4s, v17.4h, v7.h[1]
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ sub v4.16b, v4.16b, v24.16b
+
+ smlal v0.4s, v18.4h, v7.h[2]
+ smlal v0.4s, v19.4h, v7.h[3]
+ tbl v2.16b, {v4.16b}, v28.16b
+ mov v5.16b, v27.16b
+
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+
+ smlal v0.4s, v20.4h, v7.h[4]
+ smlal v0.4s, v21.4h, v7.h[5]
+
+ sdot v5.4s, v2.16b, v26.4b[0]
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+
+ subs \h, \h, #1
+ smlal v0.4s, v22.4h, v7.h[6]
+ shrn v22.4h, v5.4s, #2
+
+ smlal v0.4s, v22.4h, v7.h[7]
+ tbl v0.16b, {v0.16b, v1.16b}, v25.16b
+ sqrshrun v0.8b, v0.8h, #2
+
+ str h0, [\dst]
+ add \dst, \dst, \d_strd
+ b.gt 2b
+ ret x15
+.endif
+
+ .align JUMP_ALIGN
+L(\type\()_6tap_hv_\isa):
+ cmp \w, #4
+ b.eq 40f // w == 4
+.ifc \type, put
+ b.lt 20f // w < 4 (put only)
+.endif
+
+ // .align JUMP_ALIGN // fallthrough
+80: // HV6 - 8xN+
+ ldr q29, L(h_tbl_neon_dotprod) + 16
+ ldr q30, L(h_tbl_neon_dotprod) + 32
+ ldr d26, [\xmx] // d26 = the 8 one-byte horizontal filter taps
+.ifc \type, prep
+ add \wd_strd, \w, \w
+.endif
+
+ .align LOOP_ALIGN
+81: // one 8-pixel-wide column per iteration
+ mov \lsrc, \src // per-column row pointer
+ mov \ldst, \dst // per-column output pointer
+ mov w8, \h // rows left in this column
+
+ bl L(\type\()_hv_filter8_\isa) // prime v16..v20 with five horizontally filtered rows
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter8_\isa)
+ mov v20.16b, v22.16b
+
+ .align LOOP_ALIGN
+8:
+ ldr q23, [\lsrc] // next source row, read through the pointer the filter8 helper advanced
+ add \lsrc, \lsrc, \s_strd
+
+ smull v0.4s, v16.4h, v7.h[1] // vertical pass applies taps h[1]..h[6] only
+ smull2 v1.4s, v16.8h, v7.h[1]
+ sub v23.16b, v23.16b, v24.16b
+ mov v16.16b, v17.16b // slide the row window down by one
+
+ mov v5.16b, v27.16b // horizontal accumulators start from v27
+ mov v6.16b, v27.16b
+
+ tbl v2.16b, {v23.16b}, v28.16b
+ tbl v3.16b, {v23.16b}, v29.16b
+
+ smlal v0.4s, v17.4h, v7.h[2]
+ smlal2 v1.4s, v17.8h, v7.h[2]
+ tbl v4.16b, {v23.16b}, v30.16b
+ mov v17.16b, v18.16b
+
+ sdot v5.4s, v2.16b, v26.4b[0]
+ sdot v6.4s, v3.16b, v26.4b[0]
+ smlal v0.4s, v18.4h, v7.h[3]
+ smlal2 v1.4s, v18.8h, v7.h[3]
+ mov v18.16b, v19.16b
+
+ sdot v5.4s, v3.16b, v26.4b[1]
+ sdot v6.4s, v4.16b, v26.4b[1]
+ smlal v0.4s, v19.4h, v7.h[4]
+ smlal2 v1.4s, v19.8h, v7.h[4]
+ mov v19.16b, v20.16b
+ uzp1 v23.8h, v5.8h, v6.8h // keep the low 16 bits of each 32-bit horizontal sum
+
+ smlal v0.4s, v20.4h, v7.h[5]
+ smlal2 v1.4s, v20.8h, v7.h[5]
+ sshr v20.8h, v23.8h, #2 // v20 = newest horizontally filtered row
+.ifc \type, prep
+ smlal v0.4s, v20.4h, v7.h[6]
+ smlal2 v1.4s, v20.8h, v7.h[6]
+ rshrn v0.4h, v0.4s, #6
+ rshrn2 v0.8h, v1.4s, #6
+ st1 {v0.8h}, [\ldst], \d_strd
+ subs w8, w8, #1
+ b.gt 8b
+ add \dst, \dst, #16 // next column: 8 outputs of 2 bytes
+.else
+ subs w8, w8, #1
+ smlal v0.4s, v20.4h, v7.h[6]
+ smlal2 v1.4s, v20.8h, v7.h[6]
+ tbl v0.16b, {v0.16b, v1.16b}, v25.16b // take bytes 1..2 of every 32-bit sum
+ sqrshrun v0.8b, v0.8h, #2
+ st1 {v0.8b}, [\ldst], \d_strd
+ b.gt 8b
+ add \dst, \dst, #8 // next column: 8 outputs of 1 byte
+.endif
+ add \src, \src, #8
+ subs \w, \w, #8
+ b.gt 81b
+ ret x15 // x30 was clobbered by bl; the return address is kept in x15
+
+ .align FUNC_ALIGN
+L(\type\()_hv_filter8_\isa): // horizontal pass over one row, 8 outputs; result in v22.8h
+ ldr q4, [\lsrc] // load 16 source bytes of the current row
+ add \lsrc, \lsrc, \s_strd // advance the row pointer by one stride
+ sub v4.16b, v4.16b, v24.16b // subtract v24 (128 per byte, set up by the caller)
+ mov v22.16b, v27.16b // both accumulators start from v27
+ mov v23.16b, v27.16b
+ tbl v2.16b, {v4.16b}, v28.16b // gather the 4-byte windows selected by v28, v29 and v30
+ tbl v3.16b, {v4.16b}, v29.16b
+ tbl v4.16b, {v4.16b}, v30.16b
+ sdot v22.4s, v2.16b, v26.4b[0] // outputs 0..3, taps 0..3
+ sdot v22.4s, v3.16b, v26.4b[1] // outputs 0..3, taps 4..7
+ sdot v23.4s, v3.16b, v26.4b[0] // outputs 4..7, taps 0..3
+ sdot v23.4s, v4.16b, v26.4b[1] // outputs 4..7, taps 4..7
+ shrn v22.4h, v22.4s, #2 // narrow to 16 bit with a right shift by 2
+ shrn2 v22.8h, v23.4s, #2
+ ret // returns through x30 as set by bl
+
+ .align FUNC_ALIGN
+L(\type\()_hv_filter4_\isa): // horizontal pass over one row, 4 outputs; result in v22.4h
+ mov v22.16b, v27.16b // accumulator starts from v27
+ ld1 {v4.8b}, [\src], \s_strd // load 8 source bytes and post-increment src by one stride
+ sub v4.16b, v4.16b, v24.16b // subtract v24 (128 per byte, set up by the caller)
+ tbl v2.16b, {v4.16b}, v28.16b // gather the 4-byte windows selected by v28
+ sdot v22.4s, v2.16b, v26.4b[0] // a single group of 4 taps is applied here
+ shrn v22.4h, v22.4s, #2 // narrow to 16 bit with a right shift by 2
+ ret // returns through x30 as set by bl
+
+ .align JUMP_ALIGN
+40: // HV6 - 4xN
+ ldr s26, [\xmx, #2]
+ add \src, \src, #2
+
+ bl L(\type\()_hv_filter4_\isa)
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v20.16b, v22.16b
+
+ .align LOOP_ALIGN
+4:
+ ld1 {v4.8b}, [\src], \s_strd
+
+ smull v0.4s, v16.4h, v7.h[1]
+ smlal v0.4s, v17.4h, v7.h[2]
+ sub v4.16b, v4.16b, v24.16b
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ smlal v0.4s, v18.4h, v7.h[3]
+ smlal v0.4s, v19.4h, v7.h[4]
+ tbl v2.16b, {v4.16b}, v28.16b
+ mov v5.16b, v27.16b
+
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ sdot v5.4s, v2.16b, v26.4b[0]
+
+ smlal v0.4s, v20.4h, v7.h[5]
+ shrn v20.4h, v5.4s, #2
+.ifc \type, prep
+ smlal v0.4s, v20.4h, v7.h[6]
+ rshrn v0.4h, v0.4s, #6
+ str d0, [\dst], #8
+ subs \h, \h, #1
+.else
+ subs \h, \h, #1
+ smlal v0.4s, v20.4h, v7.h[6]
+ tbl v0.16b, {v0.16b}, v25.16b
+ sqrshrun v0.8b, v0.8h, #2
+ str s0, [\dst]
+ add \dst, \dst, \d_strd
+.endif
+ b.gt 4b
+ ret x15
+
+.ifc \type, put
+ .align JUMP_ALIGN
+20: // HV6 - 2xN
+ ldr s26, [\xmx, #2]
+ add \src, \src, #2
+
+ bl L(\type\()_hv_filter4_\isa)
+ mov v16.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v17.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v18.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v19.16b, v22.16b
+ bl L(\type\()_hv_filter4_\isa)
+ mov v20.16b, v22.16b
+
+ .align LOOP_ALIGN
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+
+ smull v0.4s, v16.4h, v7.h[1]
+ smlal v0.4s, v17.4h, v7.h[2]
+ sub v4.16b, v4.16b, v24.16b
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ smlal v0.4s, v18.4h, v7.h[3]
+ smlal v0.4s, v19.4h, v7.h[4]
+ tbl v2.16b, {v4.16b}, v28.16b
+ mov v5.16b, v27.16b
+
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ sdot v5.4s, v2.16b, v26.4b[0]
+
+ smlal v0.4s, v20.4h, v7.h[5]
+ shrn v20.4h, v5.4s, #2
+
+ subs \h, \h, #1
+ smlal v0.4s, v20.4h, v7.h[6]
+
+ tbl v0.16b, {v0.16b}, v25.16b
+ sqrshrun v0.8b, v0.8h, #2
+
+ str h0, [\dst]
+ add \dst, \dst, \d_strd
+ b.gt 2b
+ ret x15
+.endif
+
+ .align JUMP_ALIGN
+L(\type\()_8tap_h_\isa):
+ adr x9, L(\type\()_8tap_h_\isa\()_tbl) // x9 = address of the table for this path
+ ldrh w8, [x9, x8, lsl #1] // 16-bit table entry at index x8 (clz(w) - 24 from the entry code)
+.ifc \type, put
+ mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT
+ dup v27.4s, w10 // overrides the 0x2002 start value placed in v27 earlier
+.endif
+ sub x9, x9, x8 // branch target = table address - entry value
+ br x9
+
+.ifc \type, put
+ .align JUMP_ALIGN
+20: // H - 2xN
+ AARCH64_VALID_JUMP_TARGET
+ add \src, \src, #2
+ ldr s6, [\xmx, #2]
+
+ .align LOOP_ALIGN
+2:
+ ldr d0, [\src]
+ ldr d1, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+
+ sub v0.8b, v0.8b, v24.8b
+ sub v1.8b, v1.8b, v24.8b
+
+ mov v4.16b, v27.16b
+ mov v5.16b, v27.16b
+
+ tbl v2.16b, {v0.16b}, v28.16b
+ tbl v3.16b, {v1.16b}, v28.16b
+
+ sdot v4.4s, v2.16b, v6.4b[0]
+ sdot v5.4s, v3.16b, v6.4b[0]
+
+ uzp1 v4.8h, v4.8h, v5.8h
+ sqshrun v4.8b, v4.8h, #6
+
+ subs \h, \h, #2
+ fmov x8, d4
+ lsr x9, x8, #32
+ strh w8, [\dst]
+ strh w9, [\dst, \d_strd]
+ add \dst, \dst, \d_strd, lsl #1
+ b.gt 2b
+ ret
+
+.endif
+
+ .align JUMP_ALIGN
+40: // H - 4xN: 4-coefficient horizontal filter, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ add \src, \src, #2 // advance src by 2 bytes, presumably to pair with the #2 filter offset below
+ ldr s26, [\xmx, #2] // v26.4b[0] = the 4 coefficient bytes at offset 2 of the filter
+
+ .align LOOP_ALIGN
+4:
+ ldr d0, [\src] // 8 source bytes of row 0
+ ldr d1, [\src, \s_strd] // 8 source bytes of row 1
+ add \src, \src, \s_strd, lsl #1 // src += 2 rows
+
+ sub v0.8b, v0.8b, v24.8b // subtract v24 from every byte (v24 is set before this view)
+ sub v1.8b, v1.8b, v24.8b
+
+ mov v4.16b, v27.16b // seed accumulators from v27 (set by the dispatcher for put, presumably earlier for prep)
+ mov v5.16b, v27.16b
+
+ tbl v2.16b, {v0.16b}, v28.16b // rearrange bytes by index vector v28 (set before this view)
+ tbl v3.16b, {v1.16b}, v28.16b
+
+ sdot v4.4s, v2.16b, v26.4b[0] // each 32-bit lane += dot product of its 4 bytes with the 4 coefficients
+ sdot v5.4s, v3.16b, v26.4b[0]
+.ifc \type, prep
+ subs \h, \h, #2 // two rows done, the flags are consumed by b.gt below
+ shrn v4.4h, v4.4s, #2 // row 0: shift right by 2 and narrow to 16 bits
+ shrn2 v4.8h, v5.4s, #2 // row 1 goes into the upper half
+ str q4, [\dst], #16 // store both rows back to back (8 halfwords), dst += 16
+.else
+ uzp1 v4.8h, v4.8h, v5.8h // keep the low halfword of each lane: row 0 in h[0..3], row 1 in h[4..7]
+ sqshrun v4.8b, v4.8h, #6 // shift right by 6, saturate and narrow to unsigned bytes
+ subs \h, \h, #2 // two rows done, the flags are consumed by b.gt below
+ fmov x8, d4 // x8 = 8 result bytes: row 0 in bits 31:0, row 1 in bits 63:32
+ lsr x9, x8, #32 // x9 = row 1 bytes
+ str w8, [\dst] // store 4 pixels of row 0
+ str w9, [\dst, \d_strd] // store 4 pixels of row 1
+ add \dst, \dst, \d_strd, lsl #1 // dst += 2 rows
+.endif
+ b.gt 4b
+ ret
+
+ .align JUMP_ALIGN
+80: // H - 8xN: 8-coefficient horizontal filter, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, L(h_tbl_neon_dotprod) + 16 // index vector at byte offset 16 of the tbl table (v28 is set before this view)
+ ldr q30, L(h_tbl_neon_dotprod) + 32 // index vector at byte offset 32
+ ldr d26, [\xmx] // 8 coefficient bytes: v26.4b[0] = first four, v26.4b[1] = last four
+
+ .align LOOP_ALIGN
+8:
+ ldr q0, [\src] // 16 source bytes of row 0
+ ldr q16, [\src, \s_strd] // 16 source bytes of row 1
+ add \src, \src, \s_strd, lsl #1 // src += 2 rows
+
+ sub v0.16b, v0.16b, v24.16b // subtract v24 from every byte (v24 is set before this view)
+ sub v16.16b, v16.16b, v24.16b
+
+ mov v4.16b, v27.16b // row 0 accumulators, seeded from v27
+ mov v5.16b, v27.16b
+ mov v20.16b, v27.16b // row 1 accumulators, seeded from v27
+ mov v21.16b, v27.16b
+
+ tbl v1.16b, {v0.16b}, v28.16b // row 0: three byte rearrangements of the same input
+ tbl v2.16b, {v0.16b}, v29.16b
+ tbl v3.16b, {v0.16b}, v30.16b
+ tbl v17.16b, {v16.16b}, v28.16b // row 1: same three rearrangements
+ tbl v18.16b, {v16.16b}, v29.16b
+ tbl v19.16b, {v16.16b}, v30.16b
+
+ sdot v4.4s, v1.16b, v26.4b[0] // row 0, outputs 0-3: first four coefficients
+ sdot v5.4s, v2.16b, v26.4b[0] // row 0, outputs 4-7: first four coefficients
+ sdot v20.4s, v17.16b, v26.4b[0] // row 1, outputs 0-3
+ sdot v21.4s, v18.16b, v26.4b[0] // row 1, outputs 4-7
+ sdot v4.4s, v2.16b, v26.4b[1] // last four coefficients accumulate into the same lanes
+ sdot v5.4s, v3.16b, v26.4b[1]
+ sdot v20.4s, v18.16b, v26.4b[1]
+ sdot v21.4s, v19.16b, v26.4b[1]
+
+ uzp1 v4.8h, v4.8h, v5.8h // low halfwords of the 32-bit sums: 8 outputs of row 0
+ uzp1 v20.8h, v20.8h, v21.8h // 8 outputs of row 1
+.ifc \type, prep
+ sshr v4.8h, v4.8h, #2 // arithmetic shift right by 2, results stay 16-bit
+ sshr v20.8h, v20.8h, #2
+ subs \h, \h, #2 // two rows done, the flags are consumed by b.gt below
+ stp q4, q20, [\dst], #32 // store both rows back to back, dst += 32
+.else
+ sqshrun v4.8b, v4.8h, #6 // shift right by 6, saturate and narrow to unsigned bytes
+ sqshrun v20.8b, v20.8h, #6
+ subs \h, \h, #2 // two rows done, the flags are consumed by b.gt below
+ str d4, [\dst] // store 8 pixels of row 0
+ str d20, [\dst, \d_strd] // store 8 pixels of row 1
+ add \dst, \dst, \d_strd, lsl #1 // dst += 2 rows
+.endif
+ b.gt 8b
+ ret
+
+ .align JUMP_ALIGN
+160: // H - 16xN: 8-coefficient horizontal filter, one row of 16 outputs per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, L(h_tbl_neon_dotprod) + 16 // index vectors at byte offsets 16, 32 and 48 of the tbl table
+ ldr q30, L(h_tbl_neon_dotprod) + 32
+ ldr q31, L(h_tbl_neon_dotprod) + 48 // used below with a two-register table
+ ldr d26, [\xmx] // 8 coefficient bytes: v26.4b[0] = first four, v26.4b[1] = last four
+
+ .align LOOP_ALIGN
+16:
+ ldp q16, q17, [\src] // 32 source bytes of the current row
+ add \src, \src, \s_strd // src += 1 row
+
+ sub v16.16b, v16.16b, v24.16b // subtract v24 from every byte (v24 is set before this view)
+ sub v17.16b, v17.16b, v24.16b
+
+ mov v6.16b, v27.16b // four accumulators, seeded from v27
+ mov v7.16b, v27.16b
+ mov v22.16b, v27.16b
+ mov v23.16b, v27.16b
+
+ tbl v0.16b, {v16.16b}, v28.16b // byte rearrangements taken from the first 16 bytes
+ tbl v1.16b, {v16.16b}, v29.16b
+ tbl v2.16b, {v16.16b}, v30.16b
+ tbl v3.16b, {v16.16b, v17.16b}, v31.16b // this one can index into both loaded registers
+ tbl v4.16b, {v17.16b}, v28.16b // taken from the second 16 bytes
+
+ sdot v6.4s, v0.16b, v26.4b[0] // outputs 0-3: first four coefficients
+ sdot v7.4s, v1.16b, v26.4b[0] // outputs 4-7
+ sdot v22.4s, v2.16b, v26.4b[0] // outputs 8-11
+ sdot v23.4s, v3.16b, v26.4b[0] // outputs 12-15
+ sdot v6.4s, v1.16b, v26.4b[1] // last four coefficients accumulate into the same lanes
+ sdot v7.4s, v2.16b, v26.4b[1]
+ sdot v22.4s, v3.16b, v26.4b[1]
+ sdot v23.4s, v4.16b, v26.4b[1]
+
+ uzp1 v6.8h, v6.8h, v7.8h // low halfwords of the 32-bit sums: outputs 0-7
+ uzp1 v22.8h, v22.8h, v23.8h // outputs 8-15
+.ifc \type, prep
+ sshr v6.8h, v6.8h, #2 // arithmetic shift right by 2, results stay 16-bit
+ sshr v22.8h, v22.8h, #2
+ subs \h, \h, #1 // one row done, the flags are consumed by b.gt below
+ stp q6, q22, [\dst], #32 // store 16 halfwords, dst += 32
+.else
+ sqshrun v6.8b, v6.8h, #6 // shift right by 6, saturate and narrow: outputs 0-7
+ sqshrun2 v6.16b, v22.8h, #6 // outputs 8-15 into the upper half
+ subs \h, \h, #1 // one row done, the flags are consumed by b.gt below
+ str q6, [\dst] // store 16 pixels
+ add \dst, \dst, \d_strd // dst += 1 row
+.endif
+ b.gt 16b
+ ret
+
+ .align JUMP_ALIGN
+320: // H - 32xN+: shared by the 32, 64 and 128 table entries, 16 outputs per inner iteration
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, L(h_tbl_neon_dotprod) + 16 // index vectors at byte offsets 16, 32 and 48 of the tbl table
+ ldr q30, L(h_tbl_neon_dotprod) + 32
+ ldr q31, L(h_tbl_neon_dotprod) + 48 // used below with a two-register table
+ ldr d26, [\xmx] // 8 coefficient bytes: v26.4b[0] = first four, v26.4b[1] = last four
+.ifc \type, put
+ sub \d_strd, \d_strd, \w, uxtw // dst stride minus row width, because the inner loop post-increments dst
+.endif
+ sub \s_strd, \s_strd, \w, uxtw // src stride minus row width, because the inner loop post-increments src
+ mov w8, \w // w8 = columns remaining in the current row
+
+ .align LOOP_ALIGN
+32:
+ ldp q16, q17, [\src], #16 // load 32 source bytes, advance src by 16
+
+ sub v16.16b, v16.16b, v24.16b // subtract v24 from every byte (v24 is set before this view)
+ sub v17.16b, v17.16b, v24.16b
+
+ mov v6.16b, v27.16b // four accumulators, seeded from v27
+ mov v7.16b, v27.16b
+ mov v22.16b, v27.16b
+ mov v23.16b, v27.16b
+
+ tbl v0.16b, {v16.16b}, v28.16b // byte rearrangements taken from the first 16 bytes
+ tbl v1.16b, {v16.16b}, v29.16b
+ tbl v2.16b, {v16.16b}, v30.16b
+ tbl v3.16b, {v16.16b, v17.16b}, v31.16b // this one can index into both loaded registers
+ tbl v4.16b, {v17.16b}, v28.16b // taken from the second 16 bytes
+
+ sdot v6.4s, v0.16b, v26.4b[0] // outputs 0-3: first four coefficients
+ sdot v7.4s, v1.16b, v26.4b[0] // outputs 4-7
+ sdot v22.4s, v2.16b, v26.4b[0] // outputs 8-11
+ sdot v23.4s, v3.16b, v26.4b[0] // outputs 12-15
+ sdot v6.4s, v1.16b, v26.4b[1] // last four coefficients accumulate into the same lanes
+ sdot v7.4s, v2.16b, v26.4b[1]
+ sdot v22.4s, v3.16b, v26.4b[1]
+ sdot v23.4s, v4.16b, v26.4b[1]
+
+ uzp1 v6.8h, v6.8h, v7.8h // low halfwords of the 32-bit sums: outputs 0-7
+ uzp1 v22.8h, v22.8h, v23.8h // outputs 8-15
+.ifc \type, prep
+ sshr v6.8h, v6.8h, #2 // arithmetic shift right by 2, results stay 16-bit
+ sshr v22.8h, v22.8h, #2
+ subs w8, w8, #16 // 16 columns done, the flags are consumed by b.gt below
+ stp q6, q22, [\dst], #32 // store 16 halfwords, dst += 32
+.else
+ sqshrun v6.8b, v6.8h, #6 // shift right by 6, saturate and narrow: outputs 0-7
+ sqshrun2 v6.16b, v22.8h, #6 // outputs 8-15 into the upper half
+ subs w8, w8, #16 // 16 columns done, the flags are consumed by b.gt below
+ str q6, [\dst], #16 // store 16 pixels, dst += 16
+.endif
+ b.gt 32b
+
+ add \src, \src, \s_strd // end of row: step src to the start of the next row
+.ifc \type, put
+ add \dst, \dst, \d_strd // put only: step dst to the next row (prep output is written contiguously)
+.endif
+ mov w8, \w // reset the column counter
+ subs \h, \h, #1 // one row done
+ b.gt 32b
+ ret
+
+L(\type\()_8tap_h_\isa\()_tbl): // 16-bit offsets (table address minus target label), read by the horizontal dispatcher
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) // index 0
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) // index 1
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) // index 2
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) // index 3
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) // index 4
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) // index 5
+.ifc \type, put
+ .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) // index 6, the 2xN loop only exists in the put variant
+.endif
+endfunc
+.endm
+
+// prep instantiation, register map: dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
+// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7); the x and w names alias the same registers
+filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7
+
+// put instantiation, register map: dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
+// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1); the x and w names alias the same registers
+filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1
+
+DISABLE_DOTPROD
+#endif // HAVE_DOTPROD
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
index 7bef9243fb..9033072a82 100644
--- a/third_party/dav1d/src/arm/64/msac.S
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -288,10 +288,8 @@ function msac_decode_hi_tok_neon, export=1
mvni v30.4h, #0x3f // 0xffc0
ldrh w9, [x1, #6] // count = cdf[n_symbols]
ld1r {v3.4h}, [x16] // rng
- movrel x16, bits
ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
add x17, x0, #DIF + 6
- ld1 {v16.8h}, [x16]
mov w13, #-24
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
ldr w10, [x0, #ALLOW_UPDATE_CDF]
@@ -305,30 +303,27 @@ function msac_decode_hi_tok_neon, export=1
add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
str h3, [sp, #14] // store original u = s->rng
- cmhs v2.8h, v1.8h, v4.8h // c >= v
+ cmhs v2.4h, v1.4h, v4.4h // c >= v
str q4, [sp, #16] // store v values to allow indexed access
- and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
- addv h6, v6.8h // Aggregate mask bits
- umov w3, v6.h[0]
+ addv h6, v2.4h // -4 + ret
add w13, w13, #5
- rbit w3, w3
+ smov w15, v6.h[0]
add x8, sp, #16
- clz w15, w3 // ret
+ add w15, w15, #4 // ret
cbz w10, 2f
// update_cdf
- movi v5.8b, #0xff
+ sub v5.4h, v0.4h, v2.4h // cdf[i] + (i >= val ? 1 : 0)
mov w4, #-5
- urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
+ orr v2.4h, #0x80, lsl #8 // i >= val ? -1 : 32768
sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
- sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
+ sub v4.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.4h, w4 // -rate
sub w9, w9, w9, lsr #5 // count - (count == 32)
- sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
add w9, w9, #1 // count + (count < 32)
- add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
+ add v0.4h, v5.4h, v4.4h // cdf[i] + (32768 - cdf[i]) >> rate
st1 {v0.4h}, [x1]
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
strh w9, [x1, #6]
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
index 17234e027a..2a58a31322 100644
--- a/third_party/dav1d/src/arm/itx.h
+++ b/third_party/dav1d/src/arm/itx.h
@@ -28,34 +28,6 @@
#include "src/cpu.h"
#include "src/itx.h"
-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
decl_itx17_fns( 4, 4, neon);
decl_itx16_fns( 4, 8, neon);
decl_itx16_fns( 4, 16, neon);
@@ -78,41 +50,6 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
- c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
- BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
- assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
- assign_itx1_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
- assign_itx2_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
- assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
- assign_itx12_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
- assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
- assign_itx16_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
-
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h
index 06cd533a9b..7e57fd37cb 100644
--- a/third_party/dav1d/src/arm/mc.h
+++ b/third_party/dav1d/src/arm/mc.h
@@ -30,26 +30,40 @@
#include "src/mc.h"
#include "src/cpu.h"
-decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
-decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
-decl_mc_fn(BF(dav1d_put_bilin, neon));
+#define decl_8tap_gen(decl_name, fn_name, opt) /* declare the nine dav1d_<fn_name>_8tap_* functions via decl_<decl_name>_fn */ \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth, opt)); \
+ decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp, opt))
+
+#define decl_8tap_fns(opt) /* declare both sets for one ISA suffix: put via decl_mc_fn, prep via decl_mct_fn */ \
+ decl_8tap_gen(mc, put, opt); \
+ decl_8tap_gen(mct, prep, opt)
+
+#define init_8tap_gen(name, opt) /* call init_<name>_fn once per FILTER_2D_8TAP_* id, nine in total */ \
+ init_##name##_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, opt); \
+ init_##name##_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, opt)
+
+#define init_8tap_fns(opt) /* run the nine-entry init for both init_mc_fn and init_mct_fn with one ISA suffix */ \
+ init_8tap_gen(mc, opt); \
+ init_8tap_gen(mct, opt)
+
+decl_8tap_fns(neon);
+decl_8tap_fns(neon_dotprod);
-decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));
decl_avg_fn(BF(dav1d_avg, neon));
@@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
- init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
- init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
- init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
- init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
- init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
-
- init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
- init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
- init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
- init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
- init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
- init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
- init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
- init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
- init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
- init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+ init_8tap_fns(neon);
+
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
@@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
c->emu_edge = BF(dav1d_emu_edge, neon);
+
+#if ARCH_AARCH64
+#if HAVE_DOTPROD && BITDEPTH == 8
+ if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
+
+ init_8tap_fns(neon_dotprod);
+#endif // HAVE_DOTPROD && BITDEPTH == 8
+#endif // ARCH_AARCH64
}
diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c
index e0f2132e00..d9721dad46 100644
--- a/third_party/dav1d/src/cdf.c
+++ b/third_party/dav1d/src/cdf.c
@@ -65,631 +65,638 @@
#define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o)
-static const CdfModeContext av1_default_cdf = {
- .y_mode = {
- { CDF12(22801, 23489, 24293, 24756, 25601, 26123,
- 26606, 27418, 27945, 29228, 29685, 30349) },
- { CDF12(18673, 19845, 22631, 23318, 23950, 24649,
- 25527, 27364, 28152, 29701, 29984, 30852) },
- { CDF12(19770, 20979, 23396, 23939, 24241, 24654,
- 25136, 27073, 27830, 29360, 29730, 30659) },
- { CDF12(20155, 21301, 22838, 23178, 23261, 23533,
- 23703, 24804, 25352, 26575, 27016, 28049) },
- }, .use_filter_intra = {
- [BS_4x4] = { CDF1( 4621) },
- [BS_4x8] = { CDF1( 6743) },
- [BS_8x4] = { CDF1( 5893) },
- [BS_8x8] = { CDF1( 7866) },
- [BS_8x16] = { CDF1(12551) },
- [BS_16x8] = { CDF1( 9394) },
- [BS_16x16] = { CDF1(12408) },
- [BS_16x32] = { CDF1(14301) },
- [BS_32x16] = { CDF1(12756) },
- [BS_32x32] = { CDF1(22343) },
- [BS_32x64] = { CDF1(16384) },
- [BS_64x32] = { CDF1(16384) },
- [BS_64x64] = { CDF1(16384) },
- [BS_64x128] = { CDF1(16384) },
- [BS_128x64] = { CDF1(16384) },
- [BS_128x128] = { CDF1(16384) },
- [BS_4x16] = { CDF1(12770) },
- [BS_16x4] = { CDF1(10368) },
- [BS_8x32] = { CDF1(20229) },
- [BS_32x8] = { CDF1(18101) },
- [BS_16x64] = { CDF1(16384) },
- [BS_64x16] = { CDF1(16384) },
- }, .filter_intra = {
- CDF4(8949, 12776, 17211, 29558),
- }, .uv_mode = {
- {
- { CDF12(22631, 24152, 25378, 25661, 25986, 26520,
- 27055, 27923, 28244, 30059, 30941, 31961) },
- { CDF12( 9513, 26881, 26973, 27046, 27118, 27664,
- 27739, 27824, 28359, 29505, 29800, 31796) },
- { CDF12( 9845, 9915, 28663, 28704, 28757, 28780,
- 29198, 29822, 29854, 30764, 31777, 32029) },
- { CDF12(13639, 13897, 14171, 25331, 25606, 25727,
- 25953, 27148, 28577, 30612, 31355, 32493) },
- { CDF12( 9764, 9835, 9930, 9954, 25386, 27053,
- 27958, 28148, 28243, 31101, 31744, 32363) },
- { CDF12(11825, 13589, 13677, 13720, 15048, 29213,
- 29301, 29458, 29711, 31161, 31441, 32550) },
- { CDF12(14175, 14399, 16608, 16821, 17718, 17775,
- 28551, 30200, 30245, 31837, 32342, 32667) },
- { CDF12(12885, 13038, 14978, 15590, 15673, 15748,
- 16176, 29128, 29267, 30643, 31961, 32461) },
- { CDF12(12026, 13661, 13874, 15305, 15490, 15726,
- 15995, 16273, 28443, 30388, 30767, 32416) },
- { CDF12(19052, 19840, 20579, 20916, 21150, 21467,
- 21885, 22719, 23174, 28861, 30379, 32175) },
- { CDF12(18627, 19649, 20974, 21219, 21492, 21816,
- 22199, 23119, 23527, 27053, 31397, 32148) },
- { CDF12(17026, 19004, 19997, 20339, 20586, 21103,
- 21349, 21907, 22482, 25896, 26541, 31819) },
- { CDF12(12124, 13759, 14959, 14992, 15007, 15051,
- 15078, 15166, 15255, 15753, 16039, 16606) },
- }, {
- { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899,
- 15656, 15986, 20086, 20995, 22455, 24212) },
- { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199,
- 21451, 22099, 24228, 24693, 27032, 29472) },
- { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949,
- 21695, 21774, 23138, 24256, 24703, 26679) },
- { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034,
- 16741, 18371, 21520, 22206, 23389, 24182) },
- { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857,
- 22253, 22411, 24911, 25380, 26027, 26376) },
- { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402,
- 21753, 21981, 24780, 25386, 26517, 27176) },
- { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169,
- 20682, 20803, 23188, 23763, 24455, 24940) },
- { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735,
- 18827, 19059, 22336, 23204, 23964, 24793) },
- { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753,
- 10417, 18898, 22494, 23139, 24764, 25989) },
- { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040,
- 15004, 15534, 20714, 21789, 23443, 24861) },
- { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245,
- 15235, 15902, 20102, 22696, 23774, 25838) },
- { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125,
- 15163, 15636, 19676, 20474, 23519, 25208) },
- { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801,
- 8064, 8232, 9248, 9875, 10521, 29048) },
- },
- }, .angle_delta = {
- { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) },
- { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) },
- { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) },
- { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) },
- { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) },
- { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) },
- { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) },
- { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) },
- }, .filter = {
- {
- { CDF2(31935, 32720) }, { CDF2( 5568, 32719) },
- { CDF2( 422, 2938) }, { CDF2(28244, 32608) },
- { CDF2(31206, 31953) }, { CDF2( 4862, 32121) },
- { CDF2( 770, 1152) }, { CDF2(20889, 25637) },
- }, {
- { CDF2(31910, 32724) }, { CDF2( 4120, 32712) },
- { CDF2( 305, 2247) }, { CDF2(27403, 32636) },
- { CDF2(31022, 32009) }, { CDF2( 2963, 32093) },
- { CDF2( 601, 943) }, { CDF2(14969, 21398) },
- },
- }, .newmv_mode = {
- { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) },
- { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) },
- }, .globalmv_mode = {
- { CDF1( 2175) }, { CDF1( 1054) },
- }, .refmv_mode = {
- { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) },
- { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) },
- }, .drl_bit = {
- { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) },
- }, .comp_inter_mode = {
- { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) },
- { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
- { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
- { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
- { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
- { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
- { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
- { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) },
- }, .intra = {
- { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) },
- { CDF1(26538) },
- }, .comp = {
- { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) },
- { CDF1(10640) }, { CDF1( 2901) },
- }, .comp_dir = {
- { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) },
- { CDF1( 7499) }, { CDF1(22475) },
- }, .jnt_comp = {
- { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) },
- { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) },
- }, .mask_comp = {
- { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) },
- { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) },
- }, .wedge_comp = {
- { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) },
- { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) },
- { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) },
- }, .wedge_idx = {
- { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
- 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
- { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588,
- 16323, 17367, 18452, 19422, 22839, 26127, 29629) },
- { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357,
- 17939, 21332, 24520, 27470, 29456, 30529, 31656) },
- { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
- 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
- { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624,
- 15369, 16730, 18114, 19313, 22521, 26012, 29550) },
- { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
- 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
- { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
- 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
- { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033,
- 23703, 24284, 24985, 25684, 27259, 28883, 30911) },
- { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016,
- 22935, 25057, 27251, 29173, 30089, 30960, 31933) },
- }, .interintra = {
- { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) },
- { CDF1(30237) },
- }, .interintra_mode = {
- { CDF3(8192, 16384, 24576) },
- { CDF3(1875, 11082, 27332) },
- { CDF3(2473, 9996, 26388) },
- { CDF3(4238, 11537, 25926) },
- }, .interintra_wedge = {
- { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) },
- { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) },
- { CDF1(26872) },
- }, .ref = {
- { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } },
- { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } },
- { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } },
- { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } },
- { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } },
- { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } },
- }, .comp_fwd_ref = {
- { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } },
- { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } },
- { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } },
- }, .comp_bwd_ref = {
- { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } },
- { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } },
- }, .comp_uni_ref = {
- { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } },
- { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } },
- { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } },
- }, .txsz = {
- {
- { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) },
- }, {
- { CDF2(12272, 30172) }, { CDF2(12272, 30172) },
- { CDF2(18677, 30848) },
- }, {
- { CDF2(12986, 15180) }, { CDF2(12986, 15180) },
- { CDF2(24302, 25602) },
- }, {
- { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) },
- { CDF2(16803, 22759) },
- },
- }, .txpart = {
- { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } },
- { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } },
- { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } },
- { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } },
- { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } },
- { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } },
- { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } },
- }, .txtp_inter1 = {
- { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266,
- 21504, 22848, 23934, 25474, 27727, 28915, 30631) },
- { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357,
- 17674, 20408, 22517, 25010, 27116, 28856, 30749) },
- }, .txtp_inter2 = {
- CDF11( 770, 2421, 5225, 12907, 15819, 18927,
- 21561, 24089, 26595, 28526, 30529)
- }, .txtp_inter3 = {
- { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) },
- }, .txtp_intra1 = {
- {
- { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) },
- { CDF6( 564, 3335, 9709, 10870, 18143, 28094) },
- { CDF6( 672, 3247, 3676, 11982, 19415, 23127) },
- { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) },
- { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) },
- { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) },
- { CDF6( 439, 2838, 3522, 6737, 18058, 23754) },
- { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) },
- { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) },
- { CDF6( 202, 3734, 4747, 7298, 17127, 24016) },
- { CDF6( 447, 4312, 6819, 8884, 16010, 23858) },
- { CDF6( 277, 4369, 5255, 8905, 16465, 22271) },
- { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) },
- }, {
- { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) },
- { CDF6( 326, 8796, 14632, 15079, 19272, 27486) },
- { CDF6( 484, 7576, 7712, 14443, 19159, 22591) },
- { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) },
- { CDF6( 655, 4854, 5249, 5913, 22099, 27138) },
- { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) },
- { CDF6( 311, 5295, 5552, 6885, 16107, 22672) },
- { CDF6( 883, 8059, 8270, 11258, 17289, 21549) },
- { CDF6( 741, 7580, 9318, 10345, 16688, 29046) },
- { CDF6( 110, 7406, 7915, 9195, 16041, 23329) },
- { CDF6( 363, 7974, 9357, 10673, 15629, 24474) },
- { CDF6( 153, 7647, 8112, 9936, 15307, 19996) },
- { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) },
- },
- }, .txtp_intra2 = {
- {
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- }, {
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- { CDF4( 6554, 13107, 19661, 26214) },
- }, {
- { CDF4( 1127, 12814, 22772, 27483) },
- { CDF4( 145, 6761, 11980, 26667) },
- { CDF4( 362, 5887, 11678, 16725) },
- { CDF4( 385, 15213, 18587, 30693) },
- { CDF4( 25, 2914, 23134, 27903) },
- { CDF4( 60, 4470, 11749, 23991) },
- { CDF4( 37, 3332, 14511, 21448) },
- { CDF4( 157, 6320, 13036, 17439) },
- { CDF4( 119, 6719, 12906, 29396) },
- { CDF4( 47, 5537, 12576, 21499) },
- { CDF4( 269, 6076, 11258, 23115) },
- { CDF4( 83, 5615, 12001, 17228) },
- { CDF4( 1968, 5556, 12023, 18547) },
- },
- }, .skip = {
- { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) },
- }, .skip_mode = {
- { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) },
- }, .partition = {
- {
- // 128x128 -> 64x64
- { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
- { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) },
- { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) },
- { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) },
- }, {
- // 64x64 -> 32x32
- { CDF9(20137, 21547, 23078, 29566, 29837,
- 30261, 30524, 30892, 31724) },
- { CDF9( 6732, 7490, 9497, 27944, 28250,
- 28515, 28969, 29630, 30104) },
- { CDF9( 5945, 7663, 8348, 28683, 29117,
- 29749, 30064, 30298, 32238) },
- { CDF9( 870, 1212, 1487, 31198, 31394,
- 31574, 31743, 31881, 32332) },
- }, {
- // 32x32 -> 16x16
- { CDF9(18462, 20920, 23124, 27647, 28227,
- 29049, 29519, 30178, 31544) },
- { CDF9( 7689, 9060, 12056, 24992, 25660,
- 26182, 26951, 28041, 29052) },
- { CDF9( 6015, 9009, 10062, 24544, 25409,
- 26545, 27071, 27526, 32047) },
- { CDF9( 1394, 2208, 2796, 28614, 29061,
- 29466, 29840, 30185, 31899) },
- }, {
- // 16x16 -> 8x8
- { CDF9(15597, 20929, 24571, 26706, 27664,
- 28821, 29601, 30571, 31902) },
- { CDF9( 7925, 11043, 16785, 22470, 23971,
- 25043, 26651, 28701, 29834) },
- { CDF9( 5414, 13269, 15111, 20488, 22360,
- 24500, 25537, 26336, 32117) },
- { CDF9( 2662, 6362, 8614, 20860, 23053,
- 24778, 26436, 27829, 31171) },
- }, {
- // 8x8 -> 4x4 only supports the four legacy partition types
- { CDF3(19132, 25510, 30392) },
- { CDF3(13928, 19855, 28540) },
- { CDF3(12522, 23679, 28629) },
- { CDF3( 9896, 18783, 25853) },
- },
- }, .seg_pred = {
- { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
- }, .seg_id = {
- { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) },
- { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) },
- { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) },
- }, .cfl_sign = {
- CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294)
- }, .cfl_alpha = {
- { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696,
- 32700, 32704, 32708, 32712, 32716, 32720, 32724) },
- { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573,
- 32620, 32647, 32668, 32672, 32676, 32680, 32684) },
- { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649,
- 32673, 32677, 32681, 32685, 32689, 32693, 32697) },
- { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704,
- 32708, 32712, 32716, 32720, 32724, 32728, 32732) },
- { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321,
- 32394, 32464, 32516, 32560, 32576, 32593, 32622) },
- { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843,
- 32144, 32413, 32520, 32594, 32622, 32656, 32660) },
- }, .restore_wiener = {
- CDF1(11570)
- }, .restore_sgrproj = {
- CDF1(16855)
- }, .restore_switchable = {
- CDF2( 9413, 22581)
- }, .delta_q = {
- CDF3(28160, 32120, 32677)
- }, .delta_lf = {
- { CDF3(28160, 32120, 32677) },
- { CDF3(28160, 32120, 32677) },
- { CDF3(28160, 32120, 32677) },
- { CDF3(28160, 32120, 32677) },
- { CDF3(28160, 32120, 32677) },
- }, .motion_mode = {
- [BS_8x8] = { CDF2( 7651, 24760) },
- [BS_8x16] = { CDF2( 4738, 24765) },
- [BS_8x32] = { CDF2(28799, 31390) },
- [BS_16x8] = { CDF2( 5391, 25528) },
- [BS_16x16] = { CDF2(19419, 26810) },
- [BS_16x32] = { CDF2( 5123, 23606) },
- [BS_16x64] = { CDF2(28973, 31594) },
- [BS_32x8] = { CDF2(26431, 30774) },
- [BS_32x16] = { CDF2(11606, 24308) },
- [BS_32x32] = { CDF2(26260, 29116) },
- [BS_32x64] = { CDF2(20360, 28062) },
- [BS_64x16] = { CDF2(29742, 31203) },
- [BS_64x32] = { CDF2(21679, 26830) },
- [BS_64x64] = { CDF2(29516, 30701) },
- [BS_64x128] = { CDF2(28898, 30397) },
- [BS_128x64] = { CDF2(30878, 31335) },
- [BS_128x128] = { CDF2(32507, 32558) },
- }, .obmc = {
- [BS_8x8] = { CDF1(10437) },
- [BS_8x16] = { CDF1( 9371) },
- [BS_8x32] = { CDF1(23664) },
- [BS_16x8] = { CDF1( 9301) },
- [BS_16x16] = { CDF1(17432) },
- [BS_16x32] = { CDF1(14423) },
- [BS_16x64] = { CDF1(24008) },
- [BS_32x8] = { CDF1(20901) },
- [BS_32x16] = { CDF1(15142) },
- [BS_32x32] = { CDF1(25817) },
- [BS_32x64] = { CDF1(22823) },
- [BS_64x16] = { CDF1(26879) },
- [BS_64x32] = { CDF1(22083) },
- [BS_64x64] = { CDF1(30128) },
- [BS_64x128] = { CDF1(31014) },
- [BS_128x64] = { CDF1(31560) },
- [BS_128x128] = { CDF1(32638) },
- }, .pal_y = {
- { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } },
- { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } },
- { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } },
- { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } },
- { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } },
- { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } },
- { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } },
- }, .pal_sz = {
- {
- { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) },
- { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) },
- { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) },
- { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) },
- { CDF6(12725, 19180, 21863, 24839, 27535, 30120) },
- { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) },
- { CDF6(14940, 20797, 21678, 24186, 27033, 28999) },
- }, {
- { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) },
- { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) },
- { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) },
- { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) },
- { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) },
- { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) },
- { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) },
- },
- }, .pal_uv = {
- { CDF1(32461) }, { CDF1(21488) },
- }, .color_map = {
- { /* y */
+typedef struct CdfDefaultContext {
+ CdfModeContext m;
+ struct {
+ CdfMvComponent comp;
+ ALIGN(uint16_t joint[N_MV_JOINTS], 8);
+ } mv;
+ ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
+} CdfDefaultContext;
+
+static const CdfDefaultContext default_cdf = {
+ .m = {
+ .y_mode = {
+ { CDF12(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { CDF12(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { CDF12(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { CDF12(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) },
+ }, .use_filter_intra = {
+ [BS_4x4] = { CDF1( 4621) },
+ [BS_4x8] = { CDF1( 6743) },
+ [BS_8x4] = { CDF1( 5893) },
+ [BS_8x8] = { CDF1( 7866) },
+ [BS_8x16] = { CDF1(12551) },
+ [BS_16x8] = { CDF1( 9394) },
+ [BS_16x16] = { CDF1(12408) },
+ [BS_16x32] = { CDF1(14301) },
+ [BS_32x16] = { CDF1(12756) },
+ [BS_32x32] = { CDF1(22343) },
+ [BS_32x64] = { CDF1(16384) },
+ [BS_64x32] = { CDF1(16384) },
+ [BS_64x64] = { CDF1(16384) },
+ [BS_64x128] = { CDF1(16384) },
+ [BS_128x64] = { CDF1(16384) },
+ [BS_128x128] = { CDF1(16384) },
+ [BS_4x16] = { CDF1(12770) },
+ [BS_16x4] = { CDF1(10368) },
+ [BS_8x32] = { CDF1(20229) },
+ [BS_32x8] = { CDF1(18101) },
+ [BS_16x64] = { CDF1(16384) },
+ [BS_64x16] = { CDF1(16384) },
+ }, .filter_intra = {
+ CDF4(8949, 12776, 17211, 29558),
+ }, .uv_mode = {
{
- { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) },
- { CDF1(27036) }, { CDF1(31603) },
+ { CDF12(22631, 24152, 25378, 25661, 25986, 26520,
+ 27055, 27923, 28244, 30059, 30941, 31961) },
+ { CDF12( 9513, 26881, 26973, 27046, 27118, 27664,
+ 27739, 27824, 28359, 29505, 29800, 31796) },
+ { CDF12( 9845, 9915, 28663, 28704, 28757, 28780,
+ 29198, 29822, 29854, 30764, 31777, 32029) },
+ { CDF12(13639, 13897, 14171, 25331, 25606, 25727,
+ 25953, 27148, 28577, 30612, 31355, 32493) },
+ { CDF12( 9764, 9835, 9930, 9954, 25386, 27053,
+ 27958, 28148, 28243, 31101, 31744, 32363) },
+ { CDF12(11825, 13589, 13677, 13720, 15048, 29213,
+ 29301, 29458, 29711, 31161, 31441, 32550) },
+ { CDF12(14175, 14399, 16608, 16821, 17718, 17775,
+ 28551, 30200, 30245, 31837, 32342, 32667) },
+ { CDF12(12885, 13038, 14978, 15590, 15673, 15748,
+ 16176, 29128, 29267, 30643, 31961, 32461) },
+ { CDF12(12026, 13661, 13874, 15305, 15490, 15726,
+ 15995, 16273, 28443, 30388, 30767, 32416) },
+ { CDF12(19052, 19840, 20579, 20916, 21150, 21467,
+ 21885, 22719, 23174, 28861, 30379, 32175) },
+ { CDF12(18627, 19649, 20974, 21219, 21492, 21816,
+ 22199, 23119, 23527, 27053, 31397, 32148) },
+ { CDF12(17026, 19004, 19997, 20339, 20586, 21103,
+ 21349, 21907, 22482, 25896, 26541, 31819) },
+ { CDF12(12124, 13759, 14959, 14992, 15007, 15051,
+ 15078, 15166, 15255, 15753, 16039, 16606) },
}, {
- { CDF2(27877, 30490) }, { CDF2(11532, 25697) },
- { CDF2( 6544, 30234) }, { CDF2(23018, 28072) },
- { CDF2(31915, 32385) },
+ { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899,
+ 15656, 15986, 20086, 20995, 22455, 24212) },
+ { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199,
+ 21451, 22099, 24228, 24693, 27032, 29472) },
+ { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949,
+ 21695, 21774, 23138, 24256, 24703, 26679) },
+ { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034,
+ 16741, 18371, 21520, 22206, 23389, 24182) },
+ { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857,
+ 22253, 22411, 24911, 25380, 26027, 26376) },
+ { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402,
+ 21753, 21981, 24780, 25386, 26517, 27176) },
+ { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169,
+ 20682, 20803, 23188, 23763, 24455, 24940) },
+ { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735,
+ 18827, 19059, 22336, 23204, 23964, 24793) },
+ { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753,
+ 10417, 18898, 22494, 23139, 24764, 25989) },
+ { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040,
+ 15004, 15534, 20714, 21789, 23443, 24861) },
+ { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245,
+ 15235, 15902, 20102, 22696, 23774, 25838) },
+ { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125,
+ 15163, 15636, 19676, 20474, 23519, 25208) },
+ { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801,
+ 8064, 8232, 9248, 9875, 10521, 29048) },
+ },
+ }, .angle_delta = {
+ { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) },
+ { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) },
+ { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) },
+ { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) },
+ { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) },
+ { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) },
+ { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) },
+ { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) },
+ }, .filter = {
+ {
+ { CDF2(31935, 32720) }, { CDF2( 5568, 32719) },
+ { CDF2( 422, 2938) }, { CDF2(28244, 32608) },
+ { CDF2(31206, 31953) }, { CDF2( 4862, 32121) },
+ { CDF2( 770, 1152) }, { CDF2(20889, 25637) },
}, {
- { CDF3(25572, 28046, 30045) },
- { CDF3( 9478, 21590, 27256) },
- { CDF3( 7248, 26837, 29824) },
- { CDF3(19167, 24486, 28349) },
- { CDF3(31400, 31825, 32250) },
+ { CDF2(31910, 32724) }, { CDF2( 4120, 32712) },
+ { CDF2( 305, 2247) }, { CDF2(27403, 32636) },
+ { CDF2(31022, 32009) }, { CDF2( 2963, 32093) },
+ { CDF2( 601, 943) }, { CDF2(14969, 21398) },
+ },
+ }, .newmv_mode = {
+ { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) },
+ { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) },
+ }, .globalmv_mode = {
+ { CDF1( 2175) }, { CDF1( 1054) },
+ }, .refmv_mode = {
+ { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) },
+ { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) },
+ }, .drl_bit = {
+ { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) },
+ }, .comp_inter_mode = {
+ { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) },
+ }, .intra = {
+ { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) },
+ { CDF1(26538) },
+ }, .comp = {
+ { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) },
+ { CDF1(10640) }, { CDF1( 2901) },
+ }, .comp_dir = {
+ { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) },
+ { CDF1( 7499) }, { CDF1(22475) },
+ }, .jnt_comp = {
+ { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) },
+ { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) },
+ }, .mask_comp = {
+ { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) },
+ { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) },
+ }, .wedge_comp = {
+ { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) },
+ { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) },
+ { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) },
+ }, .wedge_idx = {
+ { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+ 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+ { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588,
+ 16323, 17367, 18452, 19422, 22839, 26127, 29629) },
+ { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357,
+ 17939, 21332, 24520, 27470, 29456, 30529, 31656) },
+ { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+ 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+ { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624,
+ 15369, 16730, 18114, 19313, 22521, 26012, 29550) },
+ { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+ 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+ { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+ 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+ { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033,
+ 23703, 24284, 24985, 25684, 27259, 28883, 30911) },
+ { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016,
+ 22935, 25057, 27251, 29173, 30089, 30960, 31933) },
+ }, .interintra = {
+ { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) },
+ { CDF1(30237) },
+ }, .interintra_mode = {
+ { CDF3(8192, 16384, 24576) },
+ { CDF3(1875, 11082, 27332) },
+ { CDF3(2473, 9996, 26388) },
+ { CDF3(4238, 11537, 25926) },
+ }, .interintra_wedge = {
+ { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) },
+ { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) },
+ { CDF1(26872) },
+ }, .ref = {
+ { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } },
+ { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } },
+ { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } },
+ { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } },
+ { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } },
+ { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } },
+ }, .comp_fwd_ref = {
+ { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } },
+ { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } },
+ { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } },
+ }, .comp_bwd_ref = {
+ { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } },
+ { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } },
+ }, .comp_uni_ref = {
+ { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } },
+ { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } },
+ { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } },
+ }, .txsz = {
+ {
+ { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) },
}, {
- { CDF4(24779, 26955, 28576, 30282) },
- { CDF4( 8669, 20364, 24073, 28093) },
- { CDF4( 4255, 27565, 29377, 31067) },
- { CDF4(19864, 23674, 26716, 29530) },
- { CDF4(31646, 31893, 32147, 32426) },
+ { CDF2(12272, 30172) }, { CDF2(12272, 30172) },
+ { CDF2(18677, 30848) },
}, {
- { CDF5(23132, 25407, 26970, 28435, 30073) },
- { CDF5( 7443, 17242, 20717, 24762, 27982) },
- { CDF5( 6300, 24862, 26944, 28784, 30671) },
- { CDF5(18916, 22895, 25267, 27435, 29652) },
- { CDF5(31270, 31550, 31808, 32059, 32353) },
+ { CDF2(12986, 15180) }, { CDF2(12986, 15180) },
+ { CDF2(24302, 25602) },
}, {
- { CDF6(23105, 25199, 26464, 27684, 28931, 30318) },
- { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) },
- { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) },
- { CDF6(18544, 22373, 24457, 26195, 28119, 30045) },
- { CDF6(31198, 31451, 31670, 31882, 32123, 32391) },
+ { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) },
+ { CDF2(16803, 22759) },
+ },
+ }, .txpart = {
+ { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } },
+ { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } },
+ { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } },
+ { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } },
+ { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } },
+ { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } },
+ { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } },
+ }, .txtp_inter1 = {
+ { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266,
+ 21504, 22848, 23934, 25474, 27727, 28915, 30631) },
+ { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357,
+ 17674, 20408, 22517, 25010, 27116, 28856, 30749) },
+ }, .txtp_inter2 = {
+ CDF11( 770, 2421, 5225, 12907, 15819, 18927,
+ 21561, 24089, 26595, 28526, 30529)
+ }, .txtp_inter3 = {
+ { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) },
+ }, .txtp_intra1 = {
+ {
+ { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) },
+ { CDF6( 564, 3335, 9709, 10870, 18143, 28094) },
+ { CDF6( 672, 3247, 3676, 11982, 19415, 23127) },
+ { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) },
+ { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) },
+ { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) },
+ { CDF6( 439, 2838, 3522, 6737, 18058, 23754) },
+ { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) },
+ { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) },
+ { CDF6( 202, 3734, 4747, 7298, 17127, 24016) },
+ { CDF6( 447, 4312, 6819, 8884, 16010, 23858) },
+ { CDF6( 277, 4369, 5255, 8905, 16465, 22271) },
+ { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) },
}, {
- { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
- { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) },
- { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) },
- { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
- { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) },
+ { CDF6( 326, 8796, 14632, 15079, 19272, 27486) },
+ { CDF6( 484, 7576, 7712, 14443, 19159, 22591) },
+ { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) },
+ { CDF6( 655, 4854, 5249, 5913, 22099, 27138) },
+ { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) },
+ { CDF6( 311, 5295, 5552, 6885, 16107, 22672) },
+ { CDF6( 883, 8059, 8270, 11258, 17289, 21549) },
+ { CDF6( 741, 7580, 9318, 10345, 16688, 29046) },
+ { CDF6( 110, 7406, 7915, 9195, 16041, 23329) },
+ { CDF6( 363, 7974, 9357, 10673, 15629, 24474) },
+ { CDF6( 153, 7647, 8112, 9936, 15307, 19996) },
+ { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) },
},
- }, { /* uv */
+ }, .txtp_intra2 = {
{
- { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) },
- { CDF1(29257) }, { CDF1(31610) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ }, {
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
}, {
- { CDF2(25257, 29145) }, { CDF2(12287, 27293) },
- { CDF2( 7033, 27960) }, { CDF2(20145, 25405) },
- { CDF2(30608, 31639) },
+ { CDF4( 1127, 12814, 22772, 27483) },
+ { CDF4( 145, 6761, 11980, 26667) },
+ { CDF4( 362, 5887, 11678, 16725) },
+ { CDF4( 385, 15213, 18587, 30693) },
+ { CDF4( 25, 2914, 23134, 27903) },
+ { CDF4( 60, 4470, 11749, 23991) },
+ { CDF4( 37, 3332, 14511, 21448) },
+ { CDF4( 157, 6320, 13036, 17439) },
+ { CDF4( 119, 6719, 12906, 29396) },
+ { CDF4( 47, 5537, 12576, 21499) },
+ { CDF4( 269, 6076, 11258, 23115) },
+ { CDF4( 83, 5615, 12001, 17228) },
+ { CDF4( 1968, 5556, 12023, 18547) },
+ },
+ }, .skip = {
+ { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) },
+ }, .skip_mode = {
+ { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) },
+ }, .partition = {
+ {
+ // 128x128 -> 64x64
+ { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) },
}, {
- { CDF3(24210, 27175, 29903) },
- { CDF3( 9888, 22386, 27214) },
- { CDF3( 5901, 26053, 29293) },
- { CDF3(18318, 22152, 28333) },
- { CDF3(30459, 31136, 31926) },
+ // 64x64 -> 32x32
+ { CDF9(20137, 21547, 23078, 29566, 29837,
+ 30261, 30524, 30892, 31724) },
+ { CDF9( 6732, 7490, 9497, 27944, 28250,
+ 28515, 28969, 29630, 30104) },
+ { CDF9( 5945, 7663, 8348, 28683, 29117,
+ 29749, 30064, 30298, 32238) },
+ { CDF9( 870, 1212, 1487, 31198, 31394,
+ 31574, 31743, 31881, 32332) },
}, {
- { CDF4(22980, 25479, 27781, 29986) },
- { CDF4( 8413, 21408, 24859, 28874) },
- { CDF4( 2257, 29449, 30594, 31598) },
- { CDF4(19189, 21202, 25915, 28620) },
- { CDF4(31844, 32044, 32281, 32518) },
+ // 32x32 -> 16x16
+ { CDF9(18462, 20920, 23124, 27647, 28227,
+ 29049, 29519, 30178, 31544) },
+ { CDF9( 7689, 9060, 12056, 24992, 25660,
+ 26182, 26951, 28041, 29052) },
+ { CDF9( 6015, 9009, 10062, 24544, 25409,
+ 26545, 27071, 27526, 32047) },
+ { CDF9( 1394, 2208, 2796, 28614, 29061,
+ 29466, 29840, 30185, 31899) },
}, {
- { CDF5(22217, 24567, 26637, 28683, 30548) },
- { CDF5( 7307, 16406, 19636, 24632, 28424) },
- { CDF5( 4441, 25064, 26879, 28942, 30919) },
- { CDF5(17210, 20528, 23319, 26750, 29582) },
- { CDF5(30674, 30953, 31396, 31735, 32207) },
+ // 16x16 -> 8x8
+ { CDF9(15597, 20929, 24571, 26706, 27664,
+ 28821, 29601, 30571, 31902) },
+ { CDF9( 7925, 11043, 16785, 22470, 23971,
+ 25043, 26651, 28701, 29834) },
+ { CDF9( 5414, 13269, 15111, 20488, 22360,
+ 24500, 25537, 26336, 32117) },
+ { CDF9( 2662, 6362, 8614, 20860, 23053,
+ 24778, 26436, 27829, 31171) },
}, {
- { CDF6(21239, 23168, 25044, 26962, 28705, 30506) },
- { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) },
- { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) },
- { CDF6(15889, 18323, 21704, 24698, 26976, 29690) },
- { CDF6(30988, 31204, 31479, 31734, 31983, 32325) },
+ // 8x8 -> 4x4 only supports the four legacy partition types
+ { CDF3(19132, 25510, 30392) },
+ { CDF3(13928, 19855, 28540) },
+ { CDF3(12522, 23679, 28629) },
+ { CDF3( 9896, 18783, 25853) },
+ },
+ }, .seg_pred = {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ }, .seg_id = {
+ { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) },
+ { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) },
+ { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) },
+ }, .cfl_sign = {
+ CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294)
+ }, .cfl_alpha = {
+ { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696,
+ 32700, 32704, 32708, 32712, 32716, 32720, 32724) },
+ { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573,
+ 32620, 32647, 32668, 32672, 32676, 32680, 32684) },
+ { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649,
+ 32673, 32677, 32681, 32685, 32689, 32693, 32697) },
+ { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704,
+ 32708, 32712, 32716, 32720, 32724, 32728, 32732) },
+ { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321,
+ 32394, 32464, 32516, 32560, 32576, 32593, 32622) },
+ { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843,
+ 32144, 32413, 32520, 32594, 32622, 32656, 32660) },
+ }, .restore_wiener = {
+ CDF1(11570)
+ }, .restore_sgrproj = {
+ CDF1(16855)
+ }, .restore_switchable = {
+ CDF2( 9413, 22581)
+ }, .delta_q = {
+ CDF3(28160, 32120, 32677)
+ }, .delta_lf = {
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ }, .motion_mode = {
+ [BS_8x8] = { CDF2( 7651, 24760) },
+ [BS_8x16] = { CDF2( 4738, 24765) },
+ [BS_8x32] = { CDF2(28799, 31390) },
+ [BS_16x8] = { CDF2( 5391, 25528) },
+ [BS_16x16] = { CDF2(19419, 26810) },
+ [BS_16x32] = { CDF2( 5123, 23606) },
+ [BS_16x64] = { CDF2(28973, 31594) },
+ [BS_32x8] = { CDF2(26431, 30774) },
+ [BS_32x16] = { CDF2(11606, 24308) },
+ [BS_32x32] = { CDF2(26260, 29116) },
+ [BS_32x64] = { CDF2(20360, 28062) },
+ [BS_64x16] = { CDF2(29742, 31203) },
+ [BS_64x32] = { CDF2(21679, 26830) },
+ [BS_64x64] = { CDF2(29516, 30701) },
+ [BS_64x128] = { CDF2(28898, 30397) },
+ [BS_128x64] = { CDF2(30878, 31335) },
+ [BS_128x128] = { CDF2(32507, 32558) },
+ }, .obmc = {
+ [BS_8x8] = { CDF1(10437) },
+ [BS_8x16] = { CDF1( 9371) },
+ [BS_8x32] = { CDF1(23664) },
+ [BS_16x8] = { CDF1( 9301) },
+ [BS_16x16] = { CDF1(17432) },
+ [BS_16x32] = { CDF1(14423) },
+ [BS_16x64] = { CDF1(24008) },
+ [BS_32x8] = { CDF1(20901) },
+ [BS_32x16] = { CDF1(15142) },
+ [BS_32x32] = { CDF1(25817) },
+ [BS_32x64] = { CDF1(22823) },
+ [BS_64x16] = { CDF1(26879) },
+ [BS_64x32] = { CDF1(22083) },
+ [BS_64x64] = { CDF1(30128) },
+ [BS_64x128] = { CDF1(31014) },
+ [BS_128x64] = { CDF1(31560) },
+ [BS_128x128] = { CDF1(32638) },
+ }, .pal_y = {
+ { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } },
+ { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } },
+ { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } },
+ { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } },
+ { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } },
+ { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } },
+ { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } },
+ }, .pal_sz = {
+ {
+ { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) },
+ { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) },
+ { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) },
+ { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) },
+ { CDF6(12725, 19180, 21863, 24839, 27535, 30120) },
+ { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) },
+ { CDF6(14940, 20797, 21678, 24186, 27033, 28999) },
}, {
- { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
- { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) },
- { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) },
- { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
- { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) },
+ { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) },
+ { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) },
+ { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) },
+ { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) },
+ { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) },
+ { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) },
+ },
+ }, .pal_uv = {
+ { CDF1(32461) }, { CDF1(21488) },
+ }, .color_map = {
+ { /* y */
+ {
+ { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) },
+ { CDF1(27036) }, { CDF1(31603) },
+ }, {
+ { CDF2(27877, 30490) }, { CDF2(11532, 25697) },
+ { CDF2( 6544, 30234) }, { CDF2(23018, 28072) },
+ { CDF2(31915, 32385) },
+ }, {
+ { CDF3(25572, 28046, 30045) },
+ { CDF3( 9478, 21590, 27256) },
+ { CDF3( 7248, 26837, 29824) },
+ { CDF3(19167, 24486, 28349) },
+ { CDF3(31400, 31825, 32250) },
+ }, {
+ { CDF4(24779, 26955, 28576, 30282) },
+ { CDF4( 8669, 20364, 24073, 28093) },
+ { CDF4( 4255, 27565, 29377, 31067) },
+ { CDF4(19864, 23674, 26716, 29530) },
+ { CDF4(31646, 31893, 32147, 32426) },
+ }, {
+ { CDF5(23132, 25407, 26970, 28435, 30073) },
+ { CDF5( 7443, 17242, 20717, 24762, 27982) },
+ { CDF5( 6300, 24862, 26944, 28784, 30671) },
+ { CDF5(18916, 22895, 25267, 27435, 29652) },
+ { CDF5(31270, 31550, 31808, 32059, 32353) },
+ }, {
+ { CDF6(23105, 25199, 26464, 27684, 28931, 30318) },
+ { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) },
+ { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) },
+ { CDF6(18544, 22373, 24457, 26195, 28119, 30045) },
+ { CDF6(31198, 31451, 31670, 31882, 32123, 32391) },
+ }, {
+ { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ }, { /* uv */
+ {
+ { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) },
+ { CDF1(29257) }, { CDF1(31610) },
+ }, {
+ { CDF2(25257, 29145) }, { CDF2(12287, 27293) },
+ { CDF2( 7033, 27960) }, { CDF2(20145, 25405) },
+ { CDF2(30608, 31639) },
+ }, {
+ { CDF3(24210, 27175, 29903) },
+ { CDF3( 9888, 22386, 27214) },
+ { CDF3( 5901, 26053, 29293) },
+ { CDF3(18318, 22152, 28333) },
+ { CDF3(30459, 31136, 31926) },
+ }, {
+ { CDF4(22980, 25479, 27781, 29986) },
+ { CDF4( 8413, 21408, 24859, 28874) },
+ { CDF4( 2257, 29449, 30594, 31598) },
+ { CDF4(19189, 21202, 25915, 28620) },
+ { CDF4(31844, 32044, 32281, 32518) },
+ }, {
+ { CDF5(22217, 24567, 26637, 28683, 30548) },
+ { CDF5( 7307, 16406, 19636, 24632, 28424) },
+ { CDF5( 4441, 25064, 26879, 28942, 30919) },
+ { CDF5(17210, 20528, 23319, 26750, 29582) },
+ { CDF5(30674, 30953, 31396, 31735, 32207) },
+ }, {
+ { CDF6(21239, 23168, 25044, 26962, 28705, 30506) },
+ { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) },
+ { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) },
+ { CDF6(15889, 18323, 21704, 24698, 26976, 29690) },
+ { CDF6(30988, 31204, 31479, 31734, 31983, 32325) },
+ }, {
+ { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
},
+ }, .intrabc = {
+ CDF1(30531)
+ },
+ }, .mv = {
+ .comp = {
+ .classes = {
+ CDF10(28672, 30976, 31858, 32320, 32551,
+ 32656, 32740, 32757, 32762, 32767)
+ }, .class0 = {
+ CDF1(27648)
+ }, .classN = {
+ { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) },
+ { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) },
+ { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) },
+ { CDF1(30720) },
+ }, .class0_fp = {
+ { CDF3(16384, 24576, 26624) },
+ { CDF3(12288, 21248, 24128) },
+ }, .classN_fp = {
+ CDF3( 8192, 17408, 21248)
+ }, .class0_hp = {
+ CDF1(20480)
+ }, .classN_hp = {
+ CDF1(16384)
+ }, .sign = {
+ CDF1(16384)
+ },
+ }, .joint = {
+ CDF3( 4096, 11264, 19328)
+ },
+ }, .kfym = {
+ {
+ { CDF12(15588, 17027, 19338, 20218, 20682, 21110,
+ 21825, 23244, 24189, 28165, 29093, 30466) },
+ { CDF12(12016, 18066, 19516, 20303, 20719, 21444,
+ 21888, 23032, 24434, 28658, 30172, 31409) },
+ { CDF12(10052, 10771, 22296, 22788, 23055, 23239,
+ 24133, 25620, 26160, 29336, 29929, 31567) },
+ { CDF12(14091, 15406, 16442, 18808, 19136, 19546,
+ 19998, 22096, 24746, 29585, 30958, 32462) },
+ { CDF12(12122, 13265, 15603, 16501, 18609, 20033,
+ 22391, 25583, 26437, 30261, 31073, 32475) },
+ }, {
+ { CDF12(10023, 19585, 20848, 21440, 21832, 22760,
+ 23089, 24023, 25381, 29014, 30482, 31436) },
+ { CDF12( 5983, 24099, 24560, 24886, 25066, 25795,
+ 25913, 26423, 27610, 29905, 31276, 31794) },
+ { CDF12( 7444, 12781, 20177, 20728, 21077, 21607,
+ 22170, 23405, 24469, 27915, 29090, 30492) },
+ { CDF12( 8537, 14689, 15432, 17087, 17408, 18172,
+ 18408, 19825, 24649, 29153, 31096, 32210) },
+ { CDF12( 7543, 14231, 15496, 16195, 17905, 20717,
+ 21984, 24516, 26001, 29675, 30981, 31994) },
+ }, {
+ { CDF12(12613, 13591, 21383, 22004, 22312, 22577,
+ 23401, 25055, 25729, 29538, 30305, 32077) },
+ { CDF12( 9687, 13470, 18506, 19230, 19604, 20147,
+ 20695, 22062, 23219, 27743, 29211, 30907) },
+ { CDF12( 6183, 6505, 26024, 26252, 26366, 26434,
+ 27082, 28354, 28555, 30467, 30794, 32086) },
+ { CDF12(10718, 11734, 14954, 17224, 17565, 17924,
+ 18561, 21523, 23878, 28975, 30287, 32252) },
+ { CDF12( 9194, 9858, 16501, 17263, 18424, 19171,
+ 21563, 25961, 26561, 30072, 30737, 32463) },
+ }, {
+ { CDF12(12602, 14399, 15488, 18381, 18778, 19315,
+ 19724, 21419, 25060, 29696, 30917, 32409) },
+ { CDF12( 8203, 13821, 14524, 17105, 17439, 18131,
+ 18404, 19468, 25225, 29485, 31158, 32342) },
+ { CDF12( 8451, 9731, 15004, 17643, 18012, 18425,
+ 19070, 21538, 24605, 29118, 30078, 32018) },
+ { CDF12( 7714, 9048, 9516, 16667, 16817, 16994,
+ 17153, 18767, 26743, 30389, 31536, 32528) },
+ { CDF12( 8843, 10280, 11496, 15317, 16652, 17943,
+ 19108, 22718, 25769, 29953, 30983, 32485) },
+ }, {
+ { CDF12(12578, 13671, 15979, 16834, 19075, 20913,
+ 22989, 25449, 26219, 30214, 31150, 32477) },
+ { CDF12( 9563, 13626, 15080, 15892, 17756, 20863,
+ 22207, 24236, 25380, 29653, 31143, 32277) },
+ { CDF12( 8356, 8901, 17616, 18256, 19350, 20106,
+ 22598, 25947, 26466, 29900, 30523, 32261) },
+ { CDF12(10835, 11815, 13124, 16042, 17018, 18039,
+ 18947, 22753, 24615, 29489, 30883, 32482) },
+ { CDF12( 7618, 8288, 9859, 10509, 15386, 18657,
+ 22903, 28776, 29180, 31355, 31802, 32593) },
},
- }, .intrabc = {
- CDF1(30531)
- },
-};
-
-static const CdfMvComponent default_mv_component_cdf = {
- .classes = {
- CDF10(28672, 30976, 31858, 32320, 32551,
- 32656, 32740, 32757, 32762, 32767)
- }, .class0 = {
- CDF1(27648)
- }, .classN = {
- { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) },
- { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) },
- { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) },
- { CDF1(30720) },
- }, .class0_fp = {
- { CDF3(16384, 24576, 26624) },
- { CDF3(12288, 21248, 24128) },
- }, .classN_fp = {
- CDF3( 8192, 17408, 21248)
- }, .class0_hp = {
- CDF1(20480)
- }, .classN_hp = {
- CDF1(16384)
- }, .sign = {
- CDF1(16384)
- },
-};
-
-static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = {
- CDF3( 4096, 11264, 19328)
-};
-
-static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = {
- {
- { CDF12(15588, 17027, 19338, 20218, 20682, 21110,
- 21825, 23244, 24189, 28165, 29093, 30466) },
- { CDF12(12016, 18066, 19516, 20303, 20719, 21444,
- 21888, 23032, 24434, 28658, 30172, 31409) },
- { CDF12(10052, 10771, 22296, 22788, 23055, 23239,
- 24133, 25620, 26160, 29336, 29929, 31567) },
- { CDF12(14091, 15406, 16442, 18808, 19136, 19546,
- 19998, 22096, 24746, 29585, 30958, 32462) },
- { CDF12(12122, 13265, 15603, 16501, 18609, 20033,
- 22391, 25583, 26437, 30261, 31073, 32475) },
- }, {
- { CDF12(10023, 19585, 20848, 21440, 21832, 22760,
- 23089, 24023, 25381, 29014, 30482, 31436) },
- { CDF12( 5983, 24099, 24560, 24886, 25066, 25795,
- 25913, 26423, 27610, 29905, 31276, 31794) },
- { CDF12( 7444, 12781, 20177, 20728, 21077, 21607,
- 22170, 23405, 24469, 27915, 29090, 30492) },
- { CDF12( 8537, 14689, 15432, 17087, 17408, 18172,
- 18408, 19825, 24649, 29153, 31096, 32210) },
- { CDF12( 7543, 14231, 15496, 16195, 17905, 20717,
- 21984, 24516, 26001, 29675, 30981, 31994) },
- }, {
- { CDF12(12613, 13591, 21383, 22004, 22312, 22577,
- 23401, 25055, 25729, 29538, 30305, 32077) },
- { CDF12( 9687, 13470, 18506, 19230, 19604, 20147,
- 20695, 22062, 23219, 27743, 29211, 30907) },
- { CDF12( 6183, 6505, 26024, 26252, 26366, 26434,
- 27082, 28354, 28555, 30467, 30794, 32086) },
- { CDF12(10718, 11734, 14954, 17224, 17565, 17924,
- 18561, 21523, 23878, 28975, 30287, 32252) },
- { CDF12( 9194, 9858, 16501, 17263, 18424, 19171,
- 21563, 25961, 26561, 30072, 30737, 32463) },
- }, {
- { CDF12(12602, 14399, 15488, 18381, 18778, 19315,
- 19724, 21419, 25060, 29696, 30917, 32409) },
- { CDF12( 8203, 13821, 14524, 17105, 17439, 18131,
- 18404, 19468, 25225, 29485, 31158, 32342) },
- { CDF12( 8451, 9731, 15004, 17643, 18012, 18425,
- 19070, 21538, 24605, 29118, 30078, 32018) },
- { CDF12( 7714, 9048, 9516, 16667, 16817, 16994,
- 17153, 18767, 26743, 30389, 31536, 32528) },
- { CDF12( 8843, 10280, 11496, 15317, 16652, 17943,
- 19108, 22718, 25769, 29953, 30983, 32485) },
- }, {
- { CDF12(12578, 13671, 15979, 16834, 19075, 20913,
- 22989, 25449, 26219, 30214, 31150, 32477) },
- { CDF12( 9563, 13626, 15080, 15892, 17756, 20863,
- 22207, 24236, 25380, 29653, 31143, 32277) },
- { CDF12( 8356, 8901, 17616, 18256, 19350, 20106,
- 22598, 25947, 26466, 29900, 30523, 32261) },
- { CDF12(10835, 11815, 13124, 16042, 17018, 18039,
- 18947, 22753, 24615, 29489, 30883, 32482) },
- { CDF12( 7618, 8288, 9859, 10509, 15386, 18657,
- 22903, 28776, 29180, 31355, 31802, 32593) },
},
};
-static const CdfCoefContext av1_default_coef_cdf[4] = {
+static const CdfCoefContext default_coef_cdf[4] = {
[0] = {
.skip = {
{
@@ -3951,10 +3958,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
{
#define update_cdf_1d(n1d, name) \
do { \
- memcpy(dst->name, src->name, sizeof(dst->name)); \
dst->name[n1d] = 0; \
} while (0)
-
#define update_cdf_2d(n1d, n2d, name) \
for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
#define update_cdf_3d(n1d, n2d, n3d, name) \
@@ -3962,29 +3967,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
-#define update_bit_0d(name) \
- do { \
- dst->name[0] = src->name[0]; \
- dst->name[1] = 0; \
- } while (0)
-
-#define update_bit_1d(n1d, name) \
- for (int i = 0; i < (n1d); i++) update_bit_0d(name[i])
-#define update_bit_2d(n1d, n2d, name) \
- for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j])
-#define update_bit_3d(n1d, n2d, n3d, name) \
- for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k])
+ memcpy(dst, src, offsetof(CdfContext, m.intrabc));
- update_bit_1d(N_BS_SIZES, m.use_filter_intra);
- update_cdf_1d(4, m.filter_intra);
- update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
- update_cdf_2d(8, 6, m.angle_delta);
- update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
- update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
- update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
- update_bit_1d(3, m.skip);
- update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition);
- update_bit_2d(N_TX_SIZES, 13, coef.skip);
update_cdf_3d(2, 2, 4, coef.eob_bin_16);
update_cdf_3d(2, 2, 5, coef.eob_bin_32);
update_cdf_3d(2, 2, 6, coef.eob_bin_64);
@@ -3992,106 +3976,104 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
update_cdf_3d(2, 2, 8, coef.eob_bin_256);
update_cdf_2d(2, 9, coef.eob_bin_512);
update_cdf_2d(2, 10, coef.eob_bin_1024);
- update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
- update_bit_2d(2, 3, coef.dc_sign);
update_cdf_4d(4, 2, 21, 3, coef.br_tok);
- update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
- update_cdf_1d(7, m.cfl_sign);
+ update_cdf_4d(N_TX_SIZES, 2, 11 /*22*/, 1, coef.eob_hi_bit);
+ update_cdf_3d(N_TX_SIZES, 13, 1, coef.skip);
+ update_cdf_3d(2, 3, 1, coef.dc_sign);
+
+ update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
+ update_cdf_2d(4, N_PARTITIONS - 3, m.partition[BL_128X128]);
+ for (int k = BL_64X64; k < BL_8X8; k++)
+ update_cdf_2d(4, N_PARTITIONS - 1, m.partition[k]);
+ update_cdf_2d(4, N_SUB8X8_PARTITIONS - 1, m.partition[BL_8X8]);
update_cdf_2d(6, 15, m.cfl_alpha);
- update_bit_0d(m.restore_wiener);
- update_bit_0d(m.restore_sgrproj);
- update_cdf_1d(2, m.restore_switchable);
- update_cdf_1d(3, m.delta_q);
- update_cdf_2d(5, 3, m.delta_lf);
- update_bit_2d(7, 3, m.pal_y);
- update_bit_1d(2, m.pal_uv);
- update_cdf_3d(2, 7, 6, m.pal_sz);
- update_cdf_4d(2, 7, 5, k + 1, m.color_map);
- update_bit_2d(7, 3, m.txpart);
update_cdf_2d(2, 15, m.txtp_inter1);
update_cdf_1d(11, m.txtp_inter2);
- update_bit_1d(4, m.txtp_inter3);
-
- if (IS_KEY_OR_INTRA(hdr)) {
- update_bit_0d(m.intrabc);
+ update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
+ update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
+ update_cdf_1d(7, m.cfl_sign);
+ update_cdf_2d(8, 6, m.angle_delta);
+ update_cdf_1d(4, m.filter_intra);
+ update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
+ update_cdf_3d(2, 7, 6, m.pal_sz);
+ update_cdf_4d(2, 7, 5, k + 1, m.color_map);
+ update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
+ update_cdf_1d(3, m.delta_q);
+ update_cdf_2d(5, 3, m.delta_lf);
+ update_cdf_1d(2, m.restore_switchable);
+ update_cdf_1d(1, m.restore_wiener);
+ update_cdf_1d(1, m.restore_sgrproj);
+ update_cdf_2d(4, 1, m.txtp_inter3);
+ update_cdf_2d(N_BS_SIZES, 1, m.use_filter_intra);
+ update_cdf_3d(7, 3, 1, m.txpart);
+ update_cdf_2d(3, 1, m.skip);
+ update_cdf_3d(7, 3, 1, m.pal_y);
+ update_cdf_2d(2, 1, m.pal_uv);
- update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
- for (int k = 0; k < 2; k++) {
- update_cdf_1d(10, dmv.comp[k].classes);
- update_bit_0d(dmv.comp[k].class0);
- update_bit_1d(10, dmv.comp[k].classN);
- update_bit_0d(dmv.comp[k].sign);
- }
+ if (IS_KEY_OR_INTRA(hdr))
return;
- }
- update_bit_1d(3, m.skip_mode);
+ memcpy(dst->m.y_mode, src->m.y_mode,
+ offsetof(CdfContext, kfym) - offsetof(CdfContext, m.y_mode));
+
update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
- update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
- update_bit_1d(6, m.newmv_mode);
- update_bit_1d(2, m.globalmv_mode);
- update_bit_1d(6, m.refmv_mode);
- update_bit_1d(3, m.drl_bit);
- update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
- update_bit_1d(4, m.intra);
- update_bit_1d(5, m.comp);
- update_bit_1d(5, m.comp_dir);
- update_bit_1d(6, m.jnt_comp);
- update_bit_1d(6, m.mask_comp);
- update_bit_1d(9, m.wedge_comp);
update_cdf_2d(9, 15, m.wedge_idx);
- update_bit_2d(6, 3, m.ref);
- update_bit_2d(3, 3, m.comp_fwd_ref);
- update_bit_2d(2, 3, m.comp_bwd_ref);
- update_bit_2d(3, 3, m.comp_uni_ref);
- update_bit_1d(3, m.seg_pred);
- update_bit_1d(4, m.interintra);
- update_bit_1d(7, m.interintra_wedge);
+ update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
+ update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
update_cdf_2d(4, 3, m.interintra_mode);
update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
- update_bit_1d(N_BS_SIZES, m.obmc);
+ update_cdf_2d(3, 1, m.skip_mode);
+ update_cdf_2d(6, 1, m.newmv_mode);
+ update_cdf_2d(2, 1, m.globalmv_mode);
+ update_cdf_2d(6, 1, m.refmv_mode);
+ update_cdf_2d(3, 1, m.drl_bit);
+ update_cdf_2d(4, 1, m.intra);
+ update_cdf_2d(5, 1, m.comp);
+ update_cdf_2d(5, 1, m.comp_dir);
+ update_cdf_2d(6, 1, m.jnt_comp);
+ update_cdf_2d(6, 1, m.mask_comp);
+ update_cdf_2d(9, 1, m.wedge_comp);
+ update_cdf_3d(6, 3, 1, m.ref);
+ update_cdf_3d(3, 3, 1, m.comp_fwd_ref);
+ update_cdf_3d(2, 3, 1, m.comp_bwd_ref);
+ update_cdf_3d(3, 3, 1, m.comp_uni_ref);
+ update_cdf_2d(3, 1, m.seg_pred);
+ update_cdf_2d(4, 1, m.interintra);
+ update_cdf_2d(7, 1, m.interintra_wedge);
+ update_cdf_2d(N_BS_SIZES, 1, m.obmc);
- update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
for (int k = 0; k < 2; k++) {
update_cdf_1d(10, mv.comp[k].classes);
- update_bit_0d(mv.comp[k].class0);
- update_bit_1d(10, mv.comp[k].classN);
+ update_cdf_1d(1, mv.comp[k].sign);
+ update_cdf_1d(1, mv.comp[k].class0);
update_cdf_2d(2, 3, mv.comp[k].class0_fp);
+ update_cdf_1d(1, mv.comp[k].class0_hp);
+ update_cdf_2d(10, 1, mv.comp[k].classN);
update_cdf_1d(3, mv.comp[k].classN_fp);
- update_bit_0d(mv.comp[k].class0_hp);
- update_bit_0d(mv.comp[k].classN_hp);
- update_bit_0d(mv.comp[k].sign);
+ update_cdf_1d(1, mv.comp[k].classN_hp);
}
+ update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
}
/*
* CDF threading wrappers.
*/
-static inline int get_qcat_idx(const int q) {
- if (q <= 20) return 0;
- if (q <= 60) return 1;
- if (q <= 120) return 2;
- return 3;
-}
-
-void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) {
+void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const unsigned qidx) {
cdf->ref = NULL;
- cdf->data.qcat = get_qcat_idx(qidx);
+ cdf->data.qcat = (qidx > 20) + (qidx > 60) + (qidx > 120);
}
void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
if (src->ref) {
memcpy(dst, src->data.cdf, sizeof(*dst));
} else {
- dst->m = av1_default_cdf;
- memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf));
- dst->coef = av1_default_coef_cdf[src->data.qcat];
- memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
- memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
- dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] =
- default_mv_component_cdf;
+ dst->coef = default_coef_cdf[src->data.qcat];
+ memcpy(&dst->m, &default_cdf.m,
+ offsetof(CdfDefaultContext, mv.joint));
+ memcpy(&dst->mv.comp[1], &default_cdf.mv.comp,
+ sizeof(default_cdf) - offsetof(CdfDefaultContext, mv.comp));
}
}
diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h
index 4b30474baa..c9b516dc72 100644
--- a/third_party/dav1d/src/cdf.h
+++ b/third_party/dav1d/src/cdf.h
@@ -34,12 +34,10 @@
#include "src/ref.h"
#include "src/thread_data.h"
-/* Buffers padded to [8] or [16] for SIMD where needed. */
+/* Buffers padded to [4]/[8]/[16] for SIMD where needed. */
typedef struct CdfModeContext {
- ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
- ALIGN(uint16_t wedge_idx[9][16], 32);
ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
ALIGN(uint16_t cfl_alpha[6][16], 32);
ALIGN(uint16_t txtp_inter1[2][16], 32);
@@ -49,23 +47,33 @@ typedef struct CdfModeContext {
ALIGN(uint16_t cfl_sign[8], 16);
ALIGN(uint16_t angle_delta[8][8], 16);
ALIGN(uint16_t filter_intra[5 + 3], 16);
- ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
ALIGN(uint16_t color_map[2][7][5][8], 16);
- ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
- ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
ALIGN(uint16_t delta_q[4], 8);
ALIGN(uint16_t delta_lf[5][4], 8);
- ALIGN(uint16_t interintra_mode[4][4], 8);
ALIGN(uint16_t restore_switchable[3 + 1], 8);
ALIGN(uint16_t restore_wiener[2], 4);
ALIGN(uint16_t restore_sgrproj[2], 4);
- ALIGN(uint16_t interintra[7][2], 4);
- ALIGN(uint16_t interintra_wedge[7][2], 4);
ALIGN(uint16_t txtp_inter3[4][2], 4);
ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+ ALIGN(uint16_t txpart[7][3][2], 4);
+ ALIGN(uint16_t skip[3][2], 4);
+ ALIGN(uint16_t pal_y[7][3][2], 4);
+ ALIGN(uint16_t pal_uv[2][2], 4);
+
+ /* key/intra */
+ ALIGN(uint16_t intrabc[2], 4);
+
+ /* inter/switch */
+ ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+ ALIGN(uint16_t wedge_idx[9][16], 32);
+ ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+ ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+ ALIGN(uint16_t interintra_mode[4][4], 8);
+ ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+ ALIGN(uint16_t skip_mode[3][2], 4);
ALIGN(uint16_t newmv_mode[6][2], 4);
ALIGN(uint16_t globalmv_mode[2][2], 4);
ALIGN(uint16_t refmv_mode[6][2], 4);
@@ -80,14 +88,10 @@ typedef struct CdfModeContext {
ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
- ALIGN(uint16_t txpart[7][3][2], 4);
- ALIGN(uint16_t skip[3][2], 4);
- ALIGN(uint16_t skip_mode[3][2], 4);
ALIGN(uint16_t seg_pred[3][2], 4);
+ ALIGN(uint16_t interintra[7][2], 4);
+ ALIGN(uint16_t interintra_wedge[7][2], 4);
ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
- ALIGN(uint16_t pal_y[7][3][2], 4);
- ALIGN(uint16_t pal_uv[2][2], 4);
- ALIGN(uint16_t intrabc[2], 4);
} CdfModeContext;
typedef struct CdfCoefContext {
@@ -108,13 +112,13 @@ typedef struct CdfCoefContext {
typedef struct CdfMvComponent {
ALIGN(uint16_t classes[11 + 5], 32);
+ ALIGN(uint16_t sign[2], 4);
+ ALIGN(uint16_t class0[2], 4);
ALIGN(uint16_t class0_fp[2][4], 8);
- ALIGN(uint16_t classN_fp[4], 8);
ALIGN(uint16_t class0_hp[2], 4);
- ALIGN(uint16_t classN_hp[2], 4);
- ALIGN(uint16_t class0[2], 4);
ALIGN(uint16_t classN[10][2], 4);
- ALIGN(uint16_t sign[2], 4);
+ ALIGN(uint16_t classN_fp[4], 8);
+ ALIGN(uint16_t classN_hp[2], 4);
} CdfMvComponent;
typedef struct CdfMvContext {
@@ -123,10 +127,10 @@ typedef struct CdfMvContext {
} CdfMvContext;
typedef struct CdfContext {
+ CdfCoefContext coef;
CdfModeContext m;
+ CdfMvContext mv;
ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
- CdfCoefContext coef;
- CdfMvContext mv, dmv;
} CdfContext;
typedef struct CdfThreadContext {
@@ -138,7 +142,7 @@ typedef struct CdfThreadContext {
atomic_uint *progress;
} CdfThreadContext;
-void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, unsigned qidx);
int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
const int have_frame_mt);
void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c
index eed9dfb756..7427c35592 100644
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -73,42 +73,29 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
}
}
-static int read_mv_component_diff(Dav1dTaskContext *const t,
+static int read_mv_component_diff(MsacContext *const msac,
CdfMvComponent *const mv_comp,
- const int have_fp)
+ const int mv_prec)
{
- Dav1dTileState *const ts = t->ts;
- const Dav1dFrameContext *const f = t->f;
- const int have_hp = f->frame_hdr->hp;
- const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
- const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
- mv_comp->classes, 10);
- int up, fp, hp;
+ const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign);
+ const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10);
+ int up, fp = 3, hp = 1;
if (!cl) {
- up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
- if (have_fp) {
- fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
- mv_comp->class0_fp[up], 3);
- hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
- mv_comp->class0_hp) : 1;
- } else {
- fp = 3;
- hp = 1;
+ up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0);
+ if (mv_prec >= 0) { // !force_integer_mv
+ fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3);
+ if (mv_prec > 0) // allow_high_precision_mv
+ hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp);
}
} else {
up = 1 << cl;
for (int n = 0; n < cl; n++)
- up |= dav1d_msac_decode_bool_adapt(&ts->msac,
- mv_comp->classN[n]) << n;
- if (have_fp) {
- fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
- mv_comp->classN_fp, 3);
- hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
- mv_comp->classN_hp) : 1;
- } else {
- fp = 3;
- hp = 1;
+ up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n;
+ if (mv_prec >= 0) { // !force_integer_mv
+ fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3);
+ if (mv_prec > 0) // allow_high_precision_mv
+ hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp);
}
}
@@ -117,25 +104,16 @@ static int read_mv_component_diff(Dav1dTaskContext *const t,
return sign ? -diff : diff;
}
-static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv,
- CdfMvContext *const mv_cdf, const int have_fp)
+static void read_mv_residual(Dav1dTileState *const ts, mv *const ref_mv,
+ const int mv_prec)
{
- switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
- N_MV_JOINTS - 1))
- {
- case MV_JOINT_HV:
- ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
- ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
- break;
- case MV_JOINT_H:
- ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
- break;
- case MV_JOINT_V:
- ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
- break;
- default:
- break;
- }
+ MsacContext *const msac = &ts->msac;
+ const enum MVJoint mv_joint =
+ dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1);
+ if (mv_joint & MV_JOINT_V)
+ ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec);
+ if (mv_joint & MV_JOINT_H)
+ ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec);
}
static void read_tx_tree(Dav1dTaskContext *const t,
@@ -1001,8 +979,7 @@ static int decode_b(Dav1dTaskContext *const t,
const int have_delta_q = f->frame_hdr->delta.q.present &&
(bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
- int8_t prev_delta_lf[4];
- memcpy(prev_delta_lf, ts->last_delta_lf, 4);
+ uint32_t prev_delta_lf = ts->last_delta_lf.u32;
if (have_delta_q) {
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
@@ -1038,8 +1015,8 @@ static int decode_b(Dav1dTaskContext *const t,
delta_lf = -delta_lf;
delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
}
- ts->last_delta_lf[i] =
- iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
+ ts->last_delta_lf.i8[i] =
+ iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63);
if (have_delta_q && DEBUG_BLOCK_INFO)
printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
ts->msac.rng);
@@ -1054,13 +1031,13 @@ static int decode_b(Dav1dTaskContext *const t,
init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
ts->dq = ts->dqmem;
}
- if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
+ if (!ts->last_delta_lf.u32) {
// assign frame-wide lf values to this sb
ts->lflvl = f->lf.lvl;
- } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
+ } else if (ts->last_delta_lf.u32 != prev_delta_lf) {
// find sb-specific lf lvl parameters
- dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
ts->lflvl = ts->lflvlmem;
+ dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8);
}
}
@@ -1324,7 +1301,7 @@ static int decode_b(Dav1dTaskContext *const t,
}
const union mv ref = b->mv[0];
- read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
+ read_mv_residual(ts, &b->mv[0], -1);
// clip intrabc motion vector to decoded parts of current tile
int border_left = ts->tiling.col_start * 4;
@@ -1586,8 +1563,8 @@ static int decode_b(Dav1dTaskContext *const t,
break; \
case NEWMV: \
b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
- read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
- !f->frame_hdr->force_integer_mv); \
+ const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
+ read_mv_residual(ts, &b->mv[idx], mv_prec); \
break; \
}
has_subpel_filter = imin(bw4, bh4) == 1 ||
@@ -1775,8 +1752,8 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-intermode[%d,drl=%d]: r=%d\n",
b->inter_mode, b->drl_idx, ts->msac.rng);
- read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
- !f->frame_hdr->force_integer_mv);
+ const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv;
+ read_mv_residual(ts, &b->mv[0], mv_prec);
if (DEBUG_BLOCK_INFO)
printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
b->mv[0].y, b->mv[0].x, ts->msac.rng);
@@ -2495,7 +2472,7 @@ static void setup_tile(Dav1dTileState *const ts,
dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
ts->last_qidx = f->frame_hdr->quant.yac;
- memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
+ ts->last_delta_lf.u32 = 0;
dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h
index 72f65607ed..96bf409c6c 100644
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -303,8 +303,8 @@ struct Dav1dFrameContext {
int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */
int re_sz /* h */;
ALIGN(Av1FilterLUT lim_lut, 16);
+ ALIGN(uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16);
int last_sharpness;
- uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
uint8_t *tx_lpf_right_edge[2];
uint8_t *cdef_line_buf, *lr_line_buf;
pixel *cdef_line[2 /* pre, post */][3 /* plane */];
@@ -376,8 +376,11 @@ struct Dav1dTileState {
const uint16_t (*dq)[3][2];
int last_qidx;
- int8_t last_delta_lf[4];
- uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+ union {
+ int8_t i8[4];
+ uint32_t u32;
+ } last_delta_lf;
+ ALIGN(uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16);
const uint8_t (*lflvl)[4][8][2];
Av1RestorationUnit *lr_ref[3];
diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h
index d522079907..8ef4f4df48 100644
--- a/third_party/dav1d/src/itx.h
+++ b/third_party/dav1d/src/itx.h
@@ -39,10 +39,73 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \
HIGHBD_DECL_SUFFIX)
typedef decl_itx_fn(*itxfm_fn);
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
typedef struct Dav1dInvTxfmDSPContext {
itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
} Dav1dInvTxfmDSPContext;
bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
#endif /* DAV1D_SRC_ITX_H */
diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c
index 062ba67371..09a5c532c4 100644
--- a/third_party/dav1d/src/lf_mask.c
+++ b/third_party/dav1d/src/lf_mask.c
@@ -436,7 +436,7 @@ static void calc_lf_value(uint8_t (*const lflvl_values)[2],
const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
if (!mr_delta) {
- memset(lflvl_values, base, 8 * 2);
+ memset(lflvl_values, base, sizeof(*lflvl_values) * 8);
} else {
const int sh = base >= 32;
lflvl_values[0][0] = lflvl_values[0][1] =
@@ -457,7 +457,7 @@ static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
const Dav1dLoopfilterModeRefDeltas *const mr_delta)
{
if (!base_lvl)
- memset(lflvl_values, 0, 8 * 2);
+ memset(lflvl_values, 0, sizeof(*lflvl_values) * 8);
else
calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
}
@@ -469,7 +469,7 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
const int n_seg = hdr->segmentation.enabled ? 8 : 1;
if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
- memset(lflvl_values, 0, 8 * 4 * 2 * n_seg);
+ memset(lflvl_values, 0, sizeof(*lflvl_values) * n_seg);
return;
}
diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build
index dc4be5fd6f..cd19b70c38 100644
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@@ -106,6 +106,7 @@ if is_asm_enabled
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
+ 'arm/64/mc_dotprod.S',
)
endif
diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c
index 200afebde7..1da024b630 100644
--- a/third_party/dav1d/src/refmvs.c
+++ b/third_party/dav1d/src/refmvs.c
@@ -817,7 +817,9 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->r) dav1d_freep_aligned(&rf->r);
const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
- rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
+ /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm,
+ * so add 4 bytes of padding to avoid buffer overreads. */
+ rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64);
if (!rf->r) return DAV1D_ERR(ENOMEM);
rf->r_stride = r_stride;
}
diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h
index d3f9a03a03..e11b138348 100644
--- a/third_party/dav1d/src/riscv/itx.h
+++ b/third_party/dav1d/src/riscv/itx.h
@@ -28,34 +28,6 @@
#include "src/cpu.h"
#include "src/itx.h"
-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
decl_itx16_fns( 4, 8, ext); \
@@ -70,41 +42,6 @@ decl_itx16_fns(16, 16, ext)
decl_itx_fns(rvv);
static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
- c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
- BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
- assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
- assign_itx1_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
- assign_itx2_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
- assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
- assign_itx12_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
- assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
- assign_itx16_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
-
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
index 35738e7c0b..2956ffaf29 100644
--- a/third_party/dav1d/src/x86/ipred_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -66,7 +66,8 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
-z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+const \
+z_filter_s, db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
pb_128: times 4 db 128 ; those are just placed here for alignment.
diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h
index 346fde7d90..23d7a73806 100644
--- a/third_party/dav1d/src/x86/itx.h
+++ b/third_party/dav1d/src/x86/itx.h
@@ -30,34 +30,6 @@
#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
decl_itx16_fns( 4, 8, ext); \
@@ -136,42 +108,6 @@ decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
- c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
- BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
- assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
- assign_itx1_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
- assign_itx2_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
- assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
- assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
- assign_itx12_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
- assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
- assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
- assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
- assign_itx16_fn(pfx, w, h, ext); \
- assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
-
-
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
index 42e2a5525e..6b4424946b 100644
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -1222,7 +1222,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro FN 4 ; prefix, type, type_h, type_v
+%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
@@ -1230,8 +1230,8 @@ cglobal %1_%2_16bpc
%else
mov t1d, FILTER_%4
%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
@@ -1242,22 +1242,17 @@ DECLARE_REG_TMP 7, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
-cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx2]
movifnidn wd, wm
movifnidn hd, hm
@@ -1265,6 +1260,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jnz .h
test myd, 0xf00
jnz .v
+.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
@@ -1337,43 +1333,36 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp wd, 4
je .h_w4
jl .h_w2
- WIN64_SPILL_XMM 13
+ WIN64_SPILL_XMM 11
shr mxd, 16
- sub srcq, 6
- vpbroadcastq m0, [base+subpel_filters+mxq*8]
- vbroadcasti128 m6, [subpel_h_shufA]
- vbroadcasti128 m7, [subpel_h_shufB]
+ sub srcq, 4
+ vpbroadcastq m0, [base+subpel_filters+1+mxq*8]
+ vbroadcasti128 m6, [base+subpel_h_shufA]
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
- pshufd m8, m0, q0000
- pshufd m9, m0, q1111
- pshufd m10, m0, q2222
- pshufd m11, m0, q3333
- cmp wd, 8
- jg .h_w16
+ pshufd m7, m0, q0000
+ pshufd m8, m0, q1111
+ pshufd m9, m0, q2222
+ sub wd, 16
+ jge .h_w16
.h_w8:
-%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
- pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
- pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
- pmaddwd m%5, m9, m%4 ; abcd1
- pmaddwd m%1, m8 ; abcd0
- pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
- shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
- paddd m%5, m4
- paddd m%1, m%5
- pmaddwd m%5, m11, m%2 ; abcd3
- paddd m%1, m%5
- pmaddwd m%5, m10, m%4 ; abcd2
- pshufb m%3, m7 ; a b b c c d d e
- pmaddwd m%4, m8 ; efgh0
- paddd m%1, m%5
- pmaddwd m%5, m9, m%2 ; efgh1
- shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
- pmaddwd m%3, m11 ; efgh3
- pmaddwd m%2, m10 ; efgh2
+%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%1, m6 ; 01 12 23 34
+ pshufb m%2, m6 ; 45 56 67 78
+ pmaddwd m%4, m7, m%1 ; a0
+ pshufb m%3, m6 ; 89 9a ab bc
+ pmaddwd m%5, m9, m%2 ; a2
+ shufpd m%1, m%2, 0x05 ; 23 34 45 56
+ paddd m%4, m%5 ; a0+a2
+ pmaddwd m%5, m7, m%2 ; b0
+ shufpd m%2, m%3, 0x05 ; 67 78 89 9a
+ pmaddwd m%3, m9 ; b2
+ pmaddwd m%1, m8 ; a1
+ pmaddwd m%2, m8 ; b1
+ paddd m%3, m%5 ; b0+b2
paddd m%4, m4
- paddd m%4, m%5
- paddd m%3, m%4
+ paddd m%3, m4
+ paddd m%1, m%4
paddd m%2, m%3
psrad m%1, 6
psrad m%2, 6
@@ -1384,9 +1373,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
vinserti128 m0, [srcq+ssq*1+ 0], 1
movu xm2, [srcq+ssq*0+16]
vinserti128 m2, [srcq+ssq*1+16], 1
- lea srcq, [srcq+ssq*2]
shufpd m1, m0, m2, 0x05
- PUT_8TAP_H 0, 1, 2, 3, 12
+ lea srcq, [srcq+ssq*2]
+ PUT_6TAP_H 0, 1, 2, 3, 10
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
@@ -1396,13 +1385,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
.h_w16:
mov r6d, wd
.h_w16_loop:
- movu m0, [srcq+r6*2-32]
- movu m1, [srcq+r6*2-24]
- movu m2, [srcq+r6*2-16]
- PUT_8TAP_H 0, 1, 2, 3, 12
- mova [dstq+r6*2-32], m0
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 8]
+ movu m2, [srcq+r6*2+16]
+ PUT_6TAP_H 0, 1, 2, 3, 10
+ mova [dstq+r6*2], m0
sub r6d, 16
- jg .h_w16_loop
+ jge .h_w16_loop
add srcq, ssq
add dstq, dsq
dec hd
@@ -1411,10 +1400,449 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
.v:
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+1+myq*8]
+ WIN64_SPILL_XMM 10, 12
+ vpbroadcastd m5, [pd_32]
+ vpbroadcastw m6, r8m
+ punpcklbw m0, m0
+ mov r6, ssq
+ psraw m0, 8 ; sign-extend
+ neg r6
+ pshufd m7, m0, q0000
+ pshufd m8, m0, q1111
+ pshufd m9, m0, q2222
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2: ; vertical-only 6-tap, reached when w < 4; one dword (two words) per row, two output rows per iteration
+ movd xm2, [srcq+r6 *2] ; r6 = -ss (negated in .v): row two above srcq
+ pinsrd xm2, [srcq+r6 *1], 1 ; row one above srcq
+ pinsrd xm2, [srcq+ssq*0], 2
+ pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*2]
+ movd xm0, [srcq+ssq*0] ; row 4, kept in xm0 across iterations
+ palignr xm3, xm0, xm2, 4 ; 1 2 3 4
+ punpcklwd xm1, xm2, xm3 ; 01 12
+ punpckhwd xm2, xm3 ; 23 34
+.v_w2_loop:
+ movd xm3, [srcq+ssq*1] ; row 5
+ pmaddwd xm4, xm7, xm1 ; a0 b0
+ mova xm1, xm2 ; slide window: 23 34 becomes the next iteration's 01 12
+ pmaddwd xm2, xm8 ; a1 b1
+ lea srcq, [srcq+ssq*2]
+ paddd xm4, xm2 ; a0+a1 b0+b1
+ punpckldq xm2, xm0, xm3 ; 4 5
+ movd xm0, [srcq+ssq*0] ; row 6 (becomes row 4 of the next iteration)
+ punpckldq xm3, xm0 ; 5 6
+ punpcklwd xm2, xm3 ; 45 56
+ pmaddwd xm3, xm9, xm2 ; a2 b2
+ paddd xm4, xm5 ; + m5 (pd_32 broadcast in .v; presumably the rounding bias for >>6)
+ paddd xm4, xm3 ; + a2 b2
+ psrad xm4, 6
+ packusdw xm4, xm4 ; dwords -> words with unsigned saturation
+ pminsw xm4, xm6 ; clamp to m6 (r8m broadcast in .v; presumably bitdepth_max - confirm)
+ movd [dstq+dsq*0], xm4 ; row a
+ pextrd [dstq+dsq*1], xm4, 1 ; row b
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2 ; two rows written per iteration
+ jg .v_w2_loop
+ RET
+.v_w4: ; vertical-only 6-tap, w == 4; one qword (four words) per row, rows a/b live in the low/high 128-bit lane
+ movq xm1, [srcq+r6 *2] ; r6 = -ss (negated in .v): row 0 = two rows above srcq
+ vpbroadcastq m3, [srcq+r6 *1] ; row 1
+ vpbroadcastq m2, [srcq+ssq*0] ; row 2
+ vpbroadcastq m4, [srcq+ssq*1] ; row 3
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0] ; row 4, kept in m0 across iterations
+ vpblendd m1, m3, 0x30 ; 0x30 selects dwords 4-5: high lane <- row 1
+ vpblendd m3, m2, 0x30 ; low lane row 1, high lane row 2
+ punpcklwd m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30 ; low lane row 2, high lane row 3
+ vpblendd m4, m0, 0x30 ; low lane row 3, high lane row 4
+ punpcklwd m2, m4 ; 23 34
+.v_w4_loop:
+ vpbroadcastq m3, [srcq+ssq*1] ; row 5
+ pmaddwd m4, m7, m1 ; a0 b0
+ mova m1, m2 ; slide window: 23 34 becomes the next iteration's 01 12
+ pmaddwd m2, m8 ; a1 b1
+ lea srcq, [srcq+ssq*2]
+ paddd m4, m2 ; a0+a1 b0+b1
+ vpblendd m2, m0, m3, 0x30 ; low lane row 4, high lane row 5
+ vpbroadcastq m0, [srcq+ssq*0] ; row 6 (becomes row 4 of the next iteration)
+ vpblendd m3, m0, 0x30 ; low lane row 5, high lane row 6
+ punpcklwd m2, m3 ; 45 56
+ pmaddwd m3, m9, m2 ; a2 b2
+ paddd m4, m5 ; + m5 (pd_32 broadcast in .v; presumably the rounding bias for >>6)
+ paddd m4, m3 ; + a2 b2
+ psrad m4, 6
+ vextracti128 xm3, m4, 1 ; xm3 = row b sums (high lane)
+ packusdw xm4, xm3 ; low qword = row a, high qword = row b (unsigned saturation)
+ pminsw xm4, xm6 ; clamp to m6 (r8m broadcast in .v; presumably bitdepth_max - confirm)
+ movq [dstq+dsq*0], xm4 ; row a
+ movhps [dstq+dsq*1], xm4 ; row b
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2 ; two rows written per iteration
+ jg .v_w4_loop
+ RET
+.v_w8: ; vertical-only 6-tap, w > 4; processed as 16-byte-wide columns, two rows per inner iteration
+ shl wd, 5 ; w*32 == (w/8) << 8
+ WIN64_PUSH_XMM 12
+ lea wd, [hq+wq-256] ; pack counters: low byte = h, upper bits = (number of columns - 1)
+.v_w8_loop0: ; per-column setup; srcq/dstq stay at the column top, r7/r8 walk down the rows
+ vbroadcasti128 m3, [srcq+r6 *2] ; r6 = -ss (negated in .v): row 0 = two rows above srcq
+ vbroadcasti128 m4, [srcq+r6 *1] ; row 1
+ lea r7, [srcq+ssq*2] ; r7 = running source row pointer
+ vbroadcasti128 m0, [srcq+ssq*0] ; row 2
+ vbroadcasti128 m1, [srcq+ssq*1] ; row 3
+ mov r8, dstq ; r8 = running destination row pointer
+ vbroadcasti128 m2, [r7+ssq*0] ; row 4
+ shufpd m3, m0, 0x0c
+ shufpd m4, m1, 0x0c
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ shufpd m0, m2, 0x0c
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.v_w8_loop:
+ vbroadcasti128 m5, [r7+ssq*1] ; row 5 (m5 is used as a temporary in this path)
+ pmaddwd m10, m7, m1 ; a0
+ lea r7, [r7+ssq*2]
+ pmaddwd m11, m7, m2 ; b0
+ mova m1, m3 ; slide window: 23 becomes the next iteration's 01
+ pmaddwd m3, m8 ; a1
+ mova m2, m4 ; slide window: 34 becomes the next iteration's 12
+ pmaddwd m4, m8 ; b1
+ paddd m10, m3 ; a0+a1
+ vbroadcasti128 m3, [r7+ssq*0] ; row 6
+ paddd m11, m4 ; b0+b1
+ shufpd m4, m0, m5, 0x0d
+ shufpd m0, m5, m3, 0x0c
+ punpcklwd m3, m4, m0 ; 45
+ punpckhwd m4, m0 ; 56
+ pmaddwd m5, m9, m3 ; a2
+ paddd m10, m5 ; a = a0+a1+a2
+ pmaddwd m5, m9, m4 ; b2
+ paddd m5, m11 ; b = b0+b1+b2
+ psrad m10, 5 ; shift by 5 only; pavgw below supplies the last rounded bit
+ psrad m5, 5
+ packusdw m10, m5 ; dwords -> words with unsigned saturation
+ pxor m5, m5
+ pavgw m5, m10 ; (x + 0 + 1) >> 1: rounded final shift
+ pminsw m5, m6 ; clamp to m6 (r8m broadcast in .v; presumably bitdepth_max - confirm)
+ vpermq m5, m5, q3120 ; gather row a into the low lane and row b into the high lane
+ mova [r8+dsq*0], xm5 ; row a
+ vextracti128 [r8+dsq*1], m5, 1 ; row b
+ lea r8, [r8+dsq*2]
+ sub hd, 2 ; two rows written per iteration
+ jg .v_w8_loop
+ add srcq, 16 ; advance to the next 16-byte column
+ add dstq, 16
+ movzx hd, wb ; reload h from the low byte of the packed counter
+ sub wd, 1<<8 ; one column done
+ jg .v_w8_loop0
+ RET
+.hv:
+ WIN64_SPILL_XMM 12, 16
+ vpbroadcastd m10, [pd_512]
+ vpbroadcastw m11, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+1+myq*8]
+ mov r6, ssq
+ sub srcq, 2
+ neg r6
+ pxor m6, m6
+ punpcklbw m6, m0
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_10bit
+ psraw m6, 2
+ psllw m1, 2
+.hv_10bit:
+ pshufd m7, m1, q0000
+ pshufd m8, m1, q1111
+ pshufd m9, m1, q2222
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m5, [subpel_h_shuf2]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vinserti128 m2, m0, [srcq+r6*2], 1 ; 2 0
+ movu xm1, [srcq+ssq*1]
+ vinserti128 m1, [srcq+r6 *1], 1 ; 3 1
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 0 ; 4 2
+ REPX {pshufb x, m5}, m2, m1, m0
+ REPX {pmaddwd x, m6}, m2, m1, m0
+ phaddd m2, m1
+ phaddd m1, m0
+ paddd m2, m10
+ paddd m1, m10
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; 2 3 3 4 0 1 1 2
+ punpckhqdq m0, m2, m2
+ punpcklwd m2, m0 ; 23 34
+ vextracti128 xm1, m2, 1 ; 01 12
+.hv_w2_loop:
+ movu xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu xm4, [srcq+ssq*0]
+ pshufb xm3, xm5
+ pshufb xm4, xm5
+ pmaddwd xm3, xm6
+ pmaddwd xm4, xm6
+ phaddd xm3, xm4
+ pmaddwd xm4, xm7, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm8 ; a1 b1
+ paddd xm4, xm2
+ paddd xm3, xm10
+ psrad xm3, 10
+ packssdw xm3, xm3
+ palignr xm2, xm3, xm0, 12
+ mova xm0, xm3
+ punpcklwd xm2, xm0 ; 45 56
+ pmaddwd xm3, xm9, xm2 ; a2 b2
+ paddd xm4, xm10
+ paddd xm4, xm3
+ psrad xm4, 10
+ packusdw xm4, xm4
+ pminsw xm4, xm11
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ WIN64_PUSH_XMM 14
+ vbroadcasti128 m12, [subpel_h_shufA]
+ pshufd m5, m6, q0000
+ vbroadcasti128 m13, [subpel_h_shufB]
+ pshufd m6, m6, q1111
+ movu xm2, [srcq+r6 *2]
+ vinserti128 m2, [srcq+r6 *1], 1 ; 0 1
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1 ; 2 3
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0] ; 4
+ pshufb m1, m2, m12
+ pmaddwd m1, m5
+ pshufb m2, m13
+ pmaddwd m2, m6
+ pshufb m4, m0, m12
+ pmaddwd m4, m5
+ pshufb m0, m13
+ pmaddwd m0, m6
+ paddd m2, m1
+ pshufb xm1, xm3, xm12
+ pmaddwd xm1, xm5
+ pshufb xm3, xm13
+ pmaddwd xm3, xm6
+ paddd m0, m4
+ paddd m2, m10
+ paddd xm1, xm10
+ paddd m0, m10
+ paddd xm3, xm1
+ REPX {psrad x, 10}, m2, m0, xm3
+ packssdw m2, m0 ; 0 2 1 3
+ packssdw xm0, xm3 ; 2 4
+ vperm2i128 m0, m2, 0x03
+ punpcklwd m1, m2, m0 ; 01 12
+ punpckhwd m2, m0 ; 23 34
+.hv_w4_loop:
+ movu xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*0], 1
+ pmaddwd m4, m7, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m8 ; a1 b1
+ paddd m4, m2
+ pshufb m2, m3, m12
+ pmaddwd m2, m5
+ pshufb m3, m13
+ pmaddwd m3, m6
+ paddd m2, m10
+ paddd m3, m2
+ psrad m3, 10
+ packssdw m3, m3 ; 5 5 6 6
+ vperm2i128 m2, m0, m3, 0x21
+ mova m0, m3
+ punpckhwd m2, m3 ; 45 56
+ pmaddwd m3, m9, m2 ; a2 b2
+ paddd m4, m10
+ paddd m4, m3
+ psrad m4, 10
+ vextracti128 xm3, m4, 1
+ packusdw xm4, xm3
+ pminsw xm4, xm11
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ WIN64_PUSH_XMM 16, 12
+ shr mxd, 16
+ vbroadcasti128 m12, [subpel_h_shufA]
+ vpbroadcastq m2, [base+subpel_filters+1+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+1+myq*8]
+ shl wd, 5
+ mov r6, ssq
+ sub srcq, 4
+ pxor m0, m0
+ neg r6
+ punpcklbw m0, m2
+ lea wd, [hq+wq-256]
+ test dword r8m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+ psllw xm1, 2
+.hv_w8_10bit:
+ pshufd m7, m0, q0000
+ pshufd m8, m0, q1111
+%if WIN64
+ %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+ %define v_mul (rsp+stack_offset+ 8) ; r6m
+%endif
+ mova [v_mul], xm1
+ pshufd m9, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m0, [srcq+ssq*0+ 0]
+ vinserti128 m3, m0, [srcq+r6*2+ 0], 0
+ lea r7, [srcq+ssq*2]
+ vbroadcasti128 m2, [srcq+ssq*0+16]
+ vinserti128 m1, m2, [srcq+r6*2+16], 0
+ mov r8, dstq
+ vinserti128 m0, [r7 +ssq*0+ 0], 1
+ vinserti128 m2, [r7 +ssq*0+16], 1
+ shufpd m4, m3, m1, 0x05
+%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]; horizontal 6-tap pass, packed word result left in m%1; clobbers m%2-m%5; reads m7-m9 (coefficient pairs), m10, m12 (shuffle mask)
+ pshufb m%1, m12 ; 01 12 23 34
+ pshufb m%2, m12 ; 45 56 67 78
+ pmaddwd m%4, m7, m%1 ; a0
+ pshufb m%3, m12 ; 89 9a ab bc
+ pmaddwd m%5, m9, m%2 ; a2
+ shufpd m%1, m%2, 0x05 ; 23 34 45 56
+ paddd m%4, m%5 ; a0+a2
+ pmaddwd m%5, m7, m%2 ; b0
+ shufpd m%2, m%3, 0x05 ; 67 78 89 9a
+ pmaddwd m%3, m9 ; b2
+ pmaddwd m%1, m8 ; a1
+ pmaddwd m%2, m8 ; b1
+ paddd m%3, m%5 ; b0+b2
+ paddd m%4, m10 ; a0+a2 + m10 (pd_512 broadcast in .hv; presumably the rounding bias for >>10)
+ paddd m%3, m10 ; b0+b2 + m10
+ paddd m%1, m%4 ; a = a0+a1+a2 + m10
+ paddd m%2, m%3 ; b = b0+b1+b2 + m10
+ psrad m%1, 10
+ psrad m%2, 10
+ packssdw m%1, m%2 ; dwords -> words with signed saturation (a in low qword, b in high qword of each lane)
+%endmacro
+ PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
+ movu xm4, [srcq+r6 *1+ 0]
+ vinserti128 m4, [srcq+ssq*1+ 0], 1
+ shufpd m1, m0, m2, 0x05
+ PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
+ movu xm2, [srcq+r6 *1+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ shufpd m1, m4, m2, 0x05
+ PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ vpbroadcastd m15, [v_mul+4*0]
+ vpbroadcastd m13, [v_mul+4*1]
+ movu xm5, [r7+ssq*1+ 0]
+ movu xm6, [r7+ssq*1+16]
+ lea r7, [r7+ssq*2]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ vinserti128 m5, [r7+ssq*0+ 0], 1
+ vinserti128 m6, [r7+ssq*0+16], 1
+ mova m1, m3
+ pmaddwd m3, m13 ; a1
+ mova m2, m4
+ pmaddwd m4, m13 ; b1
+ paddd m14, m3
+ shufpd m3, m5, m6, 0x05
+ paddd m15, m4
+ PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6
+ vpbroadcastd m6, [v_mul+4*2]
+ vpermq m5, m5, q3120
+ shufpd m4, m0, m5, 0x05
+ mova m0, m5
+ punpcklwd m3, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+ pmaddwd m5, m6, m3 ; a2
+ pmaddwd m6, m4 ; b2
+ paddd m14, m10
+ paddd m15, m10
+ paddd m5, m14
+ paddd m6, m15
+ psrad m5, 10
+ psrad m6, 10
+ packusdw m5, m6
+ pminsw m5, m11
+ vpermq m5, m5, q3120
+ mova [r8+dsq*0], xm5
+ vextracti128 [r8+dsq*1], m5, 1
+ lea r8, [r8+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc
+PUT_8TAP_FN sharp, SHARP, SHARP
+
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- WIN64_SPILL_XMM 15
+ WIN64_SPILL_XMM 12, 15
vpbroadcastd m6, [pd_32]
vpbroadcastw m7, r8m
lea r6, [ssq*3]
@@ -1518,19 +1946,19 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
RET
.v_w8:
shl wd, 5
- mov r7, srcq
- mov r8, dstq
+ WIN64_PUSH_XMM 15
lea wd, [hq+wq-256]
.v_w8_loop0:
vbroadcasti128 m4, [srcq+ssq*0]
vbroadcasti128 m5, [srcq+ssq*1]
+ lea r7, [srcq+ssq*4]
vbroadcasti128 m0, [srcq+r6 ]
vbroadcasti128 m6, [srcq+ssq*2]
- lea srcq, [srcq+ssq*4]
- vbroadcasti128 m1, [srcq+ssq*0]
- vbroadcasti128 m2, [srcq+ssq*1]
- vbroadcasti128 m3, [srcq+ssq*2]
- add srcq, r6
+ mov r8, dstq
+ vbroadcasti128 m1, [r7+ssq*0]
+ vbroadcasti128 m2, [r7+ssq*1]
+ vbroadcasti128 m3, [r7+ssq*2]
+ add r7, r6
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklwd m1, m4, m5 ; 01
@@ -1542,7 +1970,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
punpcklwd m3, m6, m0 ; 23
punpckhwd m6, m0 ; 56
.v_w8_loop:
- vbroadcasti128 m14, [srcq+ssq*0]
+ vbroadcasti128 m14, [r7+ssq*0]
pmaddwd m12, m8, m1 ; a0
pmaddwd m13, m8, m2 ; b0
mova m1, m3
@@ -1556,8 +1984,8 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m12, m5
- vbroadcasti128 m5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m5, [r7+ssq*1]
+ lea r7, [r7+ssq*2]
paddd m13, m6
shufpd m6, m0, m14, 0x0d
shufpd m0, m14, m5, 0x0c
@@ -1574,19 +2002,99 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
pavgw m12, m13
pminsw m12, m7
vpermq m12, m12, q3120
- mova [dstq+dsq*0], xm12
- vextracti128 [dstq+dsq*1], m12, 1
- lea dstq, [dstq+dsq*2]
+ mova [r8+dsq*0], xm12
+ vextracti128 [r8+dsq*1], m12, 1
+ lea r8, [r8+dsq*2]
sub hd, 2
jg .v_w8_loop
- add r7, 16
- add r8, 16
+ add srcq, 16
+ add dstq, 16
movzx hd, wb
- mov srcq, r7
- mov dstq, r8
sub wd, 1<<8
jg .v_w8_loop0
RET
+.h:
+ RESET_STACK_STATE
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m5, r8m
+ shr r7d, 11
+ vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
+ je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
+ WIN64_SPILL_XMM 13
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ sub wd, 16
+ jge .h_w16
+.h_w8:
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]; horizontal 8-tap pass, clamped word result left in m%1; clobbers m%2-m%5; reads m4 (rounding), m5 (clamp), m6/m7 (shuffle masks), m8-m11 (coefficient pairs)
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m4 ; abcd1 + m4 (broadcast from put_8tap_h_rnd in .h)
+ paddd m%1, m%5 ; abcd0+abcd1+rnd
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5 ; + abcd3
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5 ; + abcd2: abcd sum complete
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m4 ; efgh0 + rnd
+ paddd m%4, m%5 ; + efgh1
+ paddd m%3, m%4 ; + efgh3
+ paddd m%2, m%3 ; + efgh2: efgh sum complete
+ psrad m%1, 6
+ psrad m%2, 6
+ packusdw m%1, m%2 ; dwords -> words with unsigned saturation
+ pminsw m%1, m5 ; clamp to m5 (r8m broadcast in .h; presumably bitdepth_max - confirm)
+%endmacro
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 8]
+ movu m2, [srcq+r6*2+16]
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+r6*2], m0
+ sub r6d, 16
+ jge .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
.hv:
WIN64_SPILL_XMM 16
vpbroadcastw m15, r8m
@@ -1596,8 +2104,8 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
vpbroadcastq m1, [base+subpel_filters+myq*8]
vpbroadcastd m6, [pd_512]
lea r6, [ssq*3]
@@ -1773,17 +2281,15 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
vpbroadcastq m2, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
- cmp hd, 4
- cmovle myd, mxd
+ cmp hd, 6
+ cmovs myd, mxd
pmovsxbw xm1, [base+subpel_filters+myq*8]
shl wd, 5
lea r6, [ssq*3]
sub srcq, 6
- sub srcq, r6
pxor m0, m0
+ sub srcq, r6
punpcklbw m0, m2
- mov r7, srcq
- mov r8, dstq
lea wd, [hq+wq-256]
test dword r8m, 0x800
jz .hv_w8_10bit
@@ -1792,14 +2298,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
.hv_w8_10bit:
pshufd m11, m0, q0000
pshufd m12, m0, q1111
+ mova [v_mul], xm1
pshufd m13, m0, q2222
pshufd m14, m0, q3333
-%if WIN64
- %define v_mul (rsp+stack_offset+40) ; r4m
-%else
- %define v_mul (rsp-24) ; red zone
-%endif
- mova [v_mul], xm1
.hv_w8_loop0:
%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
@@ -1830,14 +2331,16 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%endmacro
movu xm4, [srcq+r6 *1+ 0]
vbroadcasti128 m8, [subpel_h_shufA]
+ lea r7, [srcq+ssq*4]
movu xm6, [srcq+r6 *1+ 8]
vbroadcasti128 m9, [subpel_h_shufB]
+ mov r8, dstq
movu xm0, [srcq+r6 *1+16]
vpbroadcastd m10, [pd_512]
movu xm5, [srcq+ssq*0+ 0]
- vinserti128 m5, [srcq+ssq*4+ 0], 1
+ vinserti128 m5, [r7 +ssq*0+ 0], 1
movu xm1, [srcq+ssq*0+16]
- vinserti128 m1, [srcq+ssq*4+16], 1
+ vinserti128 m1, [r7 +ssq*0+16], 1
shufpd m7, m5, m1, 0x05
INIT_XMM avx2
PUT_8TAP_HV_H 4, 6, 0 ; 3
@@ -1851,10 +2354,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
PUT_8TAP_HV_H 0, 7, 1 ; 2 6
movu xm6, [srcq+ssq*1+ 0]
movu xm1, [srcq+ssq*1+16]
- lea srcq, [srcq+ssq*4]
- vinserti128 m6, [srcq+ssq*1+ 0], 1
- vinserti128 m1, [srcq+ssq*1+16], 1
- add srcq, r6
+ vinserti128 m6, [r7 +ssq*1+ 0], 1
+ vinserti128 m1, [r7 +ssq*1+16], 1
+ add r7, r6
shufpd m7, m6, m1, 0x05
PUT_8TAP_HV_H 6, 7, 1 ; 1 5
vpermq m4, m4, q1100
@@ -1885,13 +2387,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
pmaddwd m6, m10 ; b2
paddd m8, m5
paddd m9, m6
- movu xm5, [srcq+ssq*0]
- vinserti128 m5, [srcq+ssq*1], 1
+ movu xm5, [r7+ssq*0]
+ vinserti128 m5, [r7+ssq*1], 1
vbroadcasti128 m7, [subpel_h_shufA]
vbroadcasti128 m10, [subpel_h_shufB]
- movu xm6, [srcq+ssq*0+16]
- vinserti128 m6, [srcq+ssq*1+16], 1
- vextracti128 [dstq], m0, 1
+ movu xm6, [r7+ssq*0+16]
+ vinserti128 m6, [r7+ssq*1+16], 1
+ vextracti128 [r8], m0, 1
pshufb m0, m5, m7 ; 01
pshufb m5, m10 ; 23
pmaddwd m0, m11
@@ -1902,9 +2404,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
pmaddwd m5, m13
pmaddwd m6, m14
paddd m6, m5
- movu xm5, [srcq+ssq*0+8]
- vinserti128 m5, [srcq+ssq*1+8], 1
- lea srcq, [srcq+ssq*2]
+ movu xm5, [r7+ssq*0+8]
+ vinserti128 m5, [r7+ssq*1+8], 1
+ lea r7, [r7+ssq*2]
pshufb m7, m5, m7
pshufb m5, m10
pmaddwd m10, m13, m7
@@ -1916,7 +2418,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
pmaddwd m5, m12
paddd m0, m7
paddd m5, m6
- vbroadcasti128 m6, [dstq]
+ vbroadcasti128 m6, [r8]
paddd m8, m10
paddd m9, m10
paddd m0, m10
@@ -1938,16 +2440,14 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
packusdw m7, m9
pminsw m7, m15
vpermq m7, m7, q3120
- mova [dstq+dsq*0], xm7
- vextracti128 [dstq+dsq*1], m7, 1
- lea dstq, [dstq+dsq*2]
+ mova [r8+dsq*0], xm7
+ vextracti128 [r8+dsq*1], m7, 1
+ lea r8, [r8+dsq*2]
sub hd, 2
jg .hv_w8_loop
- add r7, 16
- add r8, 16
+ add srcq, 16
+ add dstq, 16
movzx hd, wb
- mov srcq, r7
- mov dstq, r8
sub wd, 1<<8
jg .hv_w8_loop0
RET
@@ -1959,28 +2459,24 @@ DECLARE_REG_TMP 6, 7
%endif
%define PREP_8TAP_FN FN prep_8tap,
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN regular, REGULAR, REGULAR
-cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx2
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r7, [prep_avx2]
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+.prep:
tzcnt wd, wd
mov r6d, r7m ; bitdepth_max
movzx wd, word [r7+wq*2+table_offset(prep,)]
@@ -1988,7 +2484,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
shr r6d, 11
add wq, r7
vpbroadcastd m4, [base+prep_mul+r6*4]
- lea r6, [strideq*3]
+ lea r6, [ssq*3]
%if WIN64
pop r7
%endif
@@ -1998,6 +2494,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
sub srcq, 2
pmovsxbw xm0, [base+subpel_filters+mxq*8]
vbroadcasti128 m3, [subpel_h_shufA]
+ lea r6, [ssq*3]
vbroadcasti128 m4, [subpel_h_shufB]
WIN64_SPILL_XMM 8
pshufd xm0, xm0, q2211
@@ -2008,11 +2505,11 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
vpbroadcastq m6, xm0
vpermq m7, m0, q1111
.h_w4_loop:
- movu xm1, [srcq+strideq*0]
- vinserti128 m1, [srcq+strideq*2], 1
- movu xm2, [srcq+strideq*1]
- vinserti128 m2, [srcq+r6 ], 1
- lea srcq, [srcq+strideq*4]
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*2], 1
+ movu xm2, [srcq+ssq*1]
+ vinserti128 m2, [srcq+r6 *1], 1
+ lea srcq, [srcq+ssq*4]
pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
pshufb m1, m4 ; 2 3 3 4 4 5 5 6
pmaddwd m0, m6
@@ -2037,62 +2534,54 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
test myd, 0xf00
jnz .hv
vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
- lea r6, [strideq*3]
cmp wd, 4
je .h_w4
shr mxd, 16
- sub srcq, 6
- vpbroadcastq m0, [base+subpel_filters+mxq*8]
- WIN64_SPILL_XMM 12
+ sub srcq, 4
+ vpbroadcastq m0, [base+subpel_filters+1+mxq*8]
+ WIN64_SPILL_XMM 10
vbroadcasti128 m6, [subpel_h_shufA]
- vbroadcasti128 m7, [subpel_h_shufB]
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
test dword r7m, 0x800
jnz .h_12bpc
psllw m0, 2
.h_12bpc:
- pshufd m8, m0, q0000
- pshufd m9, m0, q1111
- pshufd m10, m0, q2222
- pshufd m11, m0, q3333
+ pshufd m7, m0, q0000
+ pshufd m8, m0, q1111
+ pshufd m9, m0, q2222
cmp wd, 8
jg .h_w16
.h_w8:
-%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
- pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
- pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
- pmaddwd m%5, m9, m%4 ; abcd1
- pmaddwd m%1, m8 ; abcd0
- pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
- shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
- paddd m%5, m5
- paddd m%1, m%5
- pmaddwd m%5, m11, m%2 ; abcd3
- paddd m%1, m%5
- pmaddwd m%5, m10, m%4 ; abcd2
- pshufb m%3, m7 ; a b b c c d d e
- pmaddwd m%4, m8 ; efgh0
- paddd m%1, m%5
- pmaddwd m%5, m9, m%2 ; efgh1
- shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
- pmaddwd m%3, m11 ; efgh3
- pmaddwd m%2, m10 ; efgh2
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%1, m6 ; 01 12 23 34
+ pshufb m%2, m6 ; 45 56 67 78
+ pmaddwd m%4, m7, m%1 ; a0
+ pshufb m%3, m6 ; 89 9a ab bc
+ pmaddwd m%5, m9, m%2 ; a2
+ shufpd m%1, m%2, 0x05 ; 23 34 45 56
+ paddd m%4, m%5 ; a0+a2
+ pmaddwd m%5, m7, m%2 ; b0
+ shufpd m%2, m%3, 0x05 ; 67 78 89 9a
+ pmaddwd m%3, m9 ; b2
+ pmaddwd m%1, m8 ; a1
+ pmaddwd m%2, m8 ; b1
+ paddd m%3, m%5 ; b0+b2
paddd m%4, m5
- paddd m%4, m%5
- paddd m%3, m%4
+ paddd m%3, m5
+ paddd m%1, m%4
paddd m%2, m%3
psrad m%1, 4
psrad m%2, 4
packssdw m%1, m%2
%endmacro
- movu xm0, [srcq+strideq*0+ 0]
- vinserti128 m0, [srcq+strideq*1+ 0], 1
- movu xm2, [srcq+strideq*0+16]
- vinserti128 m2, [srcq+strideq*1+16], 1
- lea srcq, [srcq+strideq*2]
- shufpd m1, m0, m2, 0x05
- PREP_8TAP_H 0, 1, 2, 3, 4
+ PREP_6TAP_H 0, 1, 2, 3, 4
mova [tmpq], m0
add tmpq, 32
sub hd, 2
@@ -2106,11 +2595,11 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
movu m0, [srcq+r6-32]
movu m1, [srcq+r6-24]
movu m2, [srcq+r6-16]
- PREP_8TAP_H 0, 1, 2, 3, 4
+ PREP_6TAP_H 0, 1, 2, 3, 4
mova [tmpq+r6-32], m0
sub r6d, 32
jg .h_w16_loop
- add srcq, strideq
+ add srcq, ssq
add tmpq, wq
dec hd
jg .h_w16_loop0
@@ -2119,13 +2608,368 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
movzx mxd, myb
shr myd, 16
cmp hd, 4
- cmovle myd, mxd
+ cmove myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+1+myq*8]
+ WIN64_SPILL_XMM 9, 12
+ vpbroadcastd m5, [prep_8tap_1d_rnd]
+ mov r6, ssq
+ punpcklbw m0, m0
+ neg r6
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m0, 2
+.v_12bpc:
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m8, m0, q2222
+ cmp wd, 4
+ jg .v_w8
+.v_w4:
+ movq xm1, [srcq+r6 *2]
+ vpbroadcastq m3, [srcq+r6 *1]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m3, 0x30
+ vpblendd m3, m2, 0x30
+ punpcklwd m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m0, 0x30
+ punpcklwd m2, m4 ; 23 34
+.v_w4_loop:
+ vpbroadcastq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m6, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m7 ; a1 b1
+ paddd m4, m2
+ vpblendd m2, m0, m3, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m3, m0, 0x30
+ punpcklwd m2, m3 ; 45 56
+ pmaddwd m3, m8, m2 ; a2 b2
+ paddd m4, m5
+ paddd m4, m3
+ psrad m4, 4
+ vextracti128 xm3, m4, 1
+ packssdw xm4, xm3
+ mova [tmpq], xm4
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ WIN64_PUSH_XMM 12
+%if WIN64
+ push r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m3, [srcq+r6 *2]
+ vbroadcasti128 m4, [srcq+r6 *1]
+ lea r5, [srcq+ssq*2]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ mov r7, tmpq
+ vbroadcasti128 m2, [r5+ssq*0]
+ shufpd m3, m0, 0x0c
+ shufpd m4, m1, 0x0c
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ shufpd m0, m2, 0x0c
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.v_w8_loop:
+ vbroadcasti128 m9, [r5+ssq*1]
+ pmaddwd m10, m6, m1 ; a0
+ lea r5, [r5+ssq*2]
+ pmaddwd m11, m6, m2 ; b0
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ paddd m10, m5
+ paddd m11, m5
+ paddd m10, m3
+ vbroadcasti128 m3, [r5+ssq*0]
+ paddd m11, m4
+ shufpd m4, m0, m9, 0x0d
+ shufpd m0, m9, m3, 0x0c
+ punpcklwd m3, m4, m0 ; 45
+ punpckhwd m4, m0 ; 56
+ pmaddwd m9, m8, m3 ; a2
+ paddd m10, m9
+ pmaddwd m9, m8, m4 ; b2
+ paddd m11, m9
+ psrad m10, 4
+ psrad m11, 4
+ packssdw m10, m11
+ vpermq m10, m10, q3120
+ mova [r7+r8*0], xm10
+ vextracti128 [r7+r8*2], m10, 1
+ lea r7, [r7+r8*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add srcq, 16
+ add tmpq, 16
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .v_w8_loop0
+%if WIN64
+ pop r8
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 13, 15
+ vpbroadcastd m7, [prep_8tap_2d_rnd]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+1+myq*8]
+ mov r6, ssq
+ sub srcq, 2
+ pxor m6, m6
+ neg r6
+ punpcklbw m6, m0
+ punpcklbw m1, m1
+ psraw m6, 4
+ psraw m1, 8
+ test dword r7m, 0x800
+ jz .hv_w4_10bit
+ psraw m6, 2
+.hv_w4_10bit:
+ pshufd m10, m1, q0000
+ pshufd m11, m1, q1111
+ pshufd m12, m1, q2222
+.hv_w4:
+ movu xm2, [srcq+r6 *2]
+ vinserti128 m2, [srcq+r6 *1], 1 ; 0 1
+ pshufd m5, m6, q0000
+ vbroadcasti128 m9, [base+subpel_h_shufB]
+ movu xm0, [srcq+ssq*0]
+ pshufd m6, m6, q1111
+ vinserti128 m0, [srcq+ssq*1], 1 ; 2 3
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0] ; 4
+ pshufb m1, m2, m8
+ pmaddwd m1, m5
+ pshufb m2, m9
+ pmaddwd m2, m6
+ pshufb m4, m0, m8
+ pmaddwd m4, m5
+ pshufb m0, m9
+ pmaddwd m0, m6
+ paddd m2, m1
+ pshufb xm1, xm3, xm8
+ pmaddwd xm1, xm5
+ pshufb xm3, xm9
+ pmaddwd xm3, xm6
+ paddd m0, m4
+ paddd m2, m7
+ paddd xm1, xm7
+ paddd m0, m7
+ paddd xm3, xm1
+ REPX {psrad x, 6}, m2, m0, xm3
+ packssdw m2, m0 ; 0 2 1 3
+ packssdw xm0, xm3 ; 2 4
+ vperm2i128 m0, m2, 0x03
+ punpcklwd m1, m2, m0 ; 01 12
+ punpckhwd m2, m0 ; 23 34
+.hv_w4_loop:
+ movu xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*0], 1
+ pmaddwd m4, m10, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m4, m2
+ pshufb m2, m3, m8
+ pmaddwd m2, m5
+ pshufb m3, m9
+ pmaddwd m3, m6
+ paddd m2, m7
+ paddd m3, m2
+ psrad m3, 6
+ packssdw m3, m3 ; 5 5 6 6
+ vperm2i128 m2, m0, m3, 0x21
+ mova m0, m3
+ punpckhwd m2, m3 ; 45 56
+ pmaddwd m3, m12, m2 ; a2 b2
+ paddd m4, m7
+ paddd m4, m3
+ psrad m4, 6
+ vextracti128 xm3, m4, 1
+ packssdw xm4, xm3
+ mova [tmpq], xm4
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+1+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+1+myq*8]
+ WIN64_PUSH_XMM 15
+%if WIN64
+ PUSH r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ mov r6, ssq
+ sub srcq, 4
+ neg r6
+ lea wd, [hq+wq-256]
+ pxor m0, m0
+ punpcklbw m0, m2
+ psraw m0, 4
+ test dword r7m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+.hv_w8_10bit:
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ mova [v_mul], xm1
+ pshufd m12, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m0, [srcq+ssq*0+ 0]
+ vinserti128 m3, m0, [srcq+r6*2+ 0], 0
+ lea r5, [srcq+ssq*2]
+ vbroadcasti128 m2, [srcq+ssq*0+16]
+ vinserti128 m1, m2, [srcq+r6*2+16], 0
+ mov r7, tmpq
+ vinserti128 m0, [r5 +ssq*0+ 0], 1
+ vinserti128 m2, [r5 +ssq*0+16], 1
+ shufpd m4, m3, m1, 0x05
+%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%1, m8 ; 01 12 23 34
+ pshufb m%2, m8 ; 45 56 67 78
+ pmaddwd m%4, m10, m%1 ; a0
+ pshufb m%3, m8 ; 89 9a ab bc
+ pmaddwd m%5, m12, m%2 ; a2
+ shufpd m%1, m%2, 0x05 ; 23 34 45 56
+ paddd m%4, m%5 ; a0+a2
+ pmaddwd m%5, m10, m%2 ; b0
+ shufpd m%2, m%3, 0x05 ; 67 78 89 9a
+ pmaddwd m%3, m12 ; b2
+ pmaddwd m%1, m11 ; a1
+ pmaddwd m%2, m11 ; b1
+ paddd m%3, m%5 ; b0+b2
+ paddd m%4, m7
+ paddd m%3, m7
+ paddd m%1, m%4
+ paddd m%2, m%3
+ psrad m%1, 6
+ psrad m%2, 6
+ packssdw m%1, m%2
+%endmacro
+ PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
+ movu xm4, [srcq+r6 *1+ 0]
+ vinserti128 m4, [srcq+ssq*1+ 0], 1
+ shufpd m1, m0, m2, 0x05
+ PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
+ movu xm2, [srcq+r6 *1+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ shufpd m1, m4, m2, 0x05
+ PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ vpbroadcastd m14, [v_mul+4*0]
+ vpbroadcastd m9, [v_mul+4*1]
+ movu xm5, [r5+ssq*1+ 0]
+ movu xm6, [r5+ssq*1+16]
+ lea r5, [r5+ssq*2]
+ pmaddwd m13, m14, m1 ; a0
+ pmaddwd m14, m2 ; b0
+ vinserti128 m5, [r5+ssq*0+ 0], 1
+ vinserti128 m6, [r5+ssq*0+16], 1
+ mova m1, m3
+ pmaddwd m3, m9 ; a1
+ mova m2, m4
+ pmaddwd m4, m9 ; b1
+ paddd m13, m3
+ shufpd m3, m5, m6, 0x05
+ paddd m14, m4
+ PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6
+ vpbroadcastd m6, [v_mul+4*2]
+ vpermq m5, m5, q3120
+ shufpd m4, m0, m5, 0x05
+ mova m0, m5
+ punpcklwd m3, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+ pmaddwd m5, m6, m3 ; a2
+ pmaddwd m6, m4 ; b2
+ paddd m13, m7
+ paddd m14, m7
+ paddd m5, m13
+ paddd m6, m14
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermq m5, m5, q3120
+ mova [r7+r8*0], xm5
+ vextracti128 [r7+r8*2], m5, 1
+ lea r7, [r7+r8*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 16
+ add tmpq, 16
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+%if WIN64
+ POP r8
+%endif
+ RET
+
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc
+PREP_8TAP_FN sharp, SHARP, SHARP
+
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+%define base r7-prep_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2]
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- WIN64_SPILL_XMM 15
+ WIN64_SPILL_XMM 12, 15
vpbroadcastd m7, [prep_8tap_1d_rnd]
lea r6, [strideq*3]
- sub srcq, r6
punpcklbw m0, m0
+ sub srcq, r6
psraw m0, 8 ; sign-extend
test dword r7m, 0x800
jnz .v_12bpc
@@ -2183,23 +3027,23 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
RET
.v_w8:
%if WIN64
+ WIN64_PUSH_XMM 15
push r8
%endif
mov r8d, wd
shl wd, 5
- mov r5, srcq
- mov r7, tmpq
lea wd, [hq+wq-256]
.v_w8_loop0:
vbroadcasti128 m4, [srcq+strideq*0]
vbroadcasti128 m5, [srcq+strideq*1]
+ lea r5, [srcq+strideq*4]
vbroadcasti128 m0, [srcq+r6 ]
vbroadcasti128 m6, [srcq+strideq*2]
- lea srcq, [srcq+strideq*4]
- vbroadcasti128 m1, [srcq+strideq*0]
- vbroadcasti128 m2, [srcq+strideq*1]
- vbroadcasti128 m3, [srcq+strideq*2]
- add srcq, r6
+ mov r7, tmpq
+ vbroadcasti128 m1, [r5+strideq*0]
+ vbroadcasti128 m2, [r5+strideq*1]
+ vbroadcasti128 m3, [r5+strideq*2]
+ add r5, r6
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklwd m1, m4, m5 ; 01
@@ -2211,7 +3055,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
punpcklwd m3, m6, m0 ; 23
punpckhwd m6, m0 ; 56
.v_w8_loop:
- vbroadcasti128 m14, [srcq+strideq*0]
+ vbroadcasti128 m14, [r5+strideq*0]
pmaddwd m12, m8, m1 ; a0
pmaddwd m13, m8, m2 ; b0
mova m1, m3
@@ -2227,8 +3071,8 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m12, m5
- vbroadcasti128 m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m5, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
paddd m13, m6
shufpd m6, m0, m14, 0x0d
shufpd m0, m14, m5, 0x0c
@@ -2242,22 +3086,101 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
psrad m13, 4
packssdw m12, m13
vpermq m12, m12, q3120
- mova [tmpq+r8*0], xm12
- vextracti128 [tmpq+r8*2], m12, 1
- lea tmpq, [tmpq+r8*4]
+ mova [r7+r8*0], xm12
+ vextracti128 [r7+r8*2], m12, 1
+ lea r7, [r7+r8*4]
sub hd, 2
jg .v_w8_loop
- add r5, 16
- add r7, 16
+ add srcq, 16
+ add tmpq, 16
movzx hd, wb
- mov srcq, r5
- mov tmpq, r7
sub wd, 1<<8
jg .v_w8_loop0
%if WIN64
pop r8
%endif
RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+ cmp wd, 4
+ je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ WIN64_SPILL_XMM 12
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m0, 2
+.h_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m5
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m5
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 4
+ psrad m%2, 4
+ packssdw m%1, m%2
+%endmacro
+ movu xm0, [srcq+strideq*0+ 0]
+ vinserti128 m0, [srcq+strideq*1+ 0], 1
+ movu xm2, [srcq+strideq*0+16]
+ vinserti128 m2, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ shufpd m1, m0, m2, 0x05
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ add wd, wd
+.h_w16_loop0:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6-32]
+ movu m1, [srcq+r6-24]
+ movu m2, [srcq+r6-16]
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq+r6-32], m0
+ sub r6d, 32
+ jg .h_w16_loop
+ add srcq, strideq
+ add tmpq, wq
+ dec hd
+ jg .h_w16_loop0
+ RET
.hv:
WIN64_SPILL_XMM 16
vpbroadcastd m15, [prep_8tap_2d_rnd]
@@ -2268,12 +3191,12 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
movzx mxd, myb
shr myd, 16
cmp hd, 4
- cmovle myd, mxd
+ cmove myd, mxd
vpbroadcastq m1, [base+subpel_filters+myq*8]
lea r6, [strideq*3]
sub srcq, 2
- sub srcq, r6
pxor m7, m7
+ sub srcq, r6
punpcklbw m7, m0
punpcklbw m1, m1
psraw m7, 4
@@ -2375,7 +3298,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
movzx mxd, myb
shr myd, 16
cmp hd, 4
- cmovle myd, mxd
+ cmove myd, mxd
pmovsxbw xm1, [base+subpel_filters+myq*8]
%if WIN64
PUSH r8
@@ -2385,12 +3308,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
lea r6, [strideq*3]
sub srcq, 6
sub srcq, r6
- mov r5, srcq
- mov r7, tmpq
lea wd, [hq+wq-256]
pxor m0, m0
punpcklbw m0, m2
- mova [v_mul], xm1
psraw m0, 4
test dword r7m, 0x800
jz .hv_w8_10bit
@@ -2398,6 +3318,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
.hv_w8_10bit:
pshufd m11, m0, q0000
pshufd m12, m0, q1111
+ mova [v_mul], xm1
pshufd m13, m0, q2222
pshufd m14, m0, q3333
.hv_w8_loop0:
@@ -2430,13 +3351,15 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
%endmacro
movu xm4, [srcq+r6 + 0]
vbroadcasti128 m8, [subpel_h_shufA]
+ lea r5, [srcq+strideq*4]
movu xm6, [srcq+r6 + 8]
vbroadcasti128 m9, [subpel_h_shufB]
+ mov r7, tmpq
movu xm0, [srcq+r6 +16]
movu xm5, [srcq+strideq*0+ 0]
- vinserti128 m5, [srcq+strideq*4+ 0], 1
+ vinserti128 m5, [r5 +strideq*0+ 0], 1
movu xm1, [srcq+strideq*0+16]
- vinserti128 m1, [srcq+strideq*4+16], 1
+ vinserti128 m1, [r5 +strideq*0+16], 1
shufpd m7, m5, m1, 0x05
INIT_XMM avx2
PREP_8TAP_HV_H 4, 6, 0 ; 3
@@ -2450,10 +3373,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
PREP_8TAP_HV_H 0, 7, 1 ; 2 6
movu xm6, [srcq+strideq*1+ 0]
movu xm1, [srcq+strideq*1+16]
- lea srcq, [srcq+strideq*4]
- vinserti128 m6, [srcq+strideq*1+ 0], 1
- vinserti128 m1, [srcq+strideq*1+16], 1
- add srcq, r6
+ vinserti128 m6, [r5 +strideq*1+ 0], 1
+ vinserti128 m1, [r5 +strideq*1+16], 1
+ add r5, r6
shufpd m7, m6, m1, 0x05
PREP_8TAP_HV_H 6, 7, 1 ; 1 5
vpermq m4, m4, q1100
@@ -2486,13 +3408,13 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
pmaddwd m6, m10 ; b2
paddd m8, m5
paddd m9, m6
- movu xm5, [srcq+strideq*0]
- vinserti128 m5, [srcq+strideq*1], 1
+ movu xm5, [r5+strideq*0]
+ vinserti128 m5, [r5+strideq*1], 1
vbroadcasti128 m7, [subpel_h_shufA]
vbroadcasti128 m10, [subpel_h_shufB]
- movu xm6, [srcq+strideq*0+16]
- vinserti128 m6, [srcq+strideq*1+16], 1
- vextracti128 [tmpq], m0, 1
+ movu xm6, [r5+strideq*0+16]
+ vinserti128 m6, [r5+strideq*1+16], 1
+ vextracti128 [r7], m0, 1
pshufb m0, m5, m7 ; 01
pshufb m5, m10 ; 23
pmaddwd m0, m11
@@ -2505,9 +3427,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
pmaddwd m6, m14
paddd m5, m15
paddd m6, m5
- movu xm5, [srcq+strideq*0+8]
- vinserti128 m5, [srcq+strideq*1+8], 1
- lea srcq, [srcq+strideq*2]
+ movu xm5, [r5+strideq*0+8]
+ vinserti128 m5, [r5+strideq*1+8], 1
+ lea r5, [r5+strideq*2]
pshufb m7, m5, m7
pshufb m5, m10
pmaddwd m10, m13, m7
@@ -2518,7 +3440,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
pmaddwd m5, m12
paddd m0, m7
paddd m5, m6
- vbroadcasti128 m6, [tmpq]
+ vbroadcasti128 m6, [r7]
vpbroadcastd m10, [v_mul+4*3]
psrad m0, 6
psrad m5, 6
@@ -2535,16 +3457,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
psrad m9, 6
packssdw m7, m9
vpermq m7, m7, q3120
- mova [tmpq+r8*0], xm7
- vextracti128 [tmpq+r8*2], m7, 1
- lea tmpq, [tmpq+r8*4]
+ mova [r7+r8*0], xm7
+ vextracti128 [r7+r8*2], m7, 1
+ lea r7, [r7+r8*4]
sub hd, 2
jg .hv_w8_loop
- add r5, 16
- add r7, 16
+ add srcq, 16
+ add tmpq, 16
movzx hd, wb
- mov srcq, r5
- mov tmpq, r7
sub wd, 1<<8
jg .hv_w8_loop0
%if WIN64
@@ -4223,14 +5143,14 @@ DECLARE_REG_TMP 6, 8
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
-PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put
@@ -4242,14 +5162,14 @@ DECLARE_REG_TMP 6, 7
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
-PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
index 58e3cb5af1..df8bebb1cb 100644
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -60,15 +60,14 @@ subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 1
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
-bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f
@@ -95,6 +94,7 @@ pq_0x40000000: dq 0x40000000
cextern mc_subpel_filters
cextern mc_warp_filter2
cextern resize_filter
+cextern z_filter_s
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
@@ -184,7 +184,9 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
@@ -298,7 +300,7 @@ INIT_YMM avx2
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 255
- vbroadcasti128 m4, [bilin_h_shuf8]
+ vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r7m ; my
@@ -900,7 +902,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
imul mxyd, 255
- vbroadcasti128 m4, [bilin_h_shuf8]
+ vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r6m ; my
@@ -1436,7 +1438,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro FN 4 ; fn, type, type_h, type_v
+%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
@@ -1444,8 +1446,8 @@ cglobal %1_%2_8bpc
%else
mov t1d, FILTER_%4
%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
@@ -1456,28 +1458,24 @@ DECLARE_REG_TMP 7, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
-cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx2]
- movsxd wq, wm
+ mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
@@ -1487,36 +1485,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pop r8
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
- WIN64_SPILL_XMM 11
- cmp wd, 4
- jl .h_w2
- vbroadcasti128 m6, [subpel_h_shufA]
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m7, [subpel_h_shufB]
- vbroadcasti128 m8, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
- vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
- vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
- add wq, r8
- jmp wq
.h_w2:
movzx mxd, mxb
- dec srcq
- mova xm4, [subpel_h_shuf4]
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+ lea srcq, [srcq-1]
+ vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
+ je .h_w4
+ mova xm3, [subpel_h_shuf4]
.h_w2_loop:
movq xm0, [srcq+ssq*0]
movhps xm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pmaddubsw xm0, xm3
+ pshufb xm0, xm3
+ pmaddubsw xm0, xm4
phaddw xm0, xm0
paddw xm0, xm5
psraw xm0, 6
@@ -1528,17 +1508,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_w2_loop
RET
.h_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+ mova xm3, [subpel_h_shufA]
.h_w4_loop:
movq xm0, [srcq+ssq*0]
movq xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- pshufb xm0, xm6
- pshufb xm1, xm6
- pmaddubsw xm0, xm3
- pmaddubsw xm1, xm3
+ pshufb xm0, xm3
+ pshufb xm1, xm3
+ pmaddubsw xm0, xm4
+ pmaddubsw xm1, xm4
phaddw xm0, xm1
paddw xm0, xm5
psraw xm0, 6
@@ -1549,25 +1527,43 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
sub hd, 2
jg .h_w4_loop
RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jle .h_w2
+ WIN64_SPILL_XMM 11
+ tzcnt wd, wd
+ vbroadcasti128 m4, [z_filter_s+ 2] ; 01
+ shr mxd, 16
+ vbroadcasti128 m6, [z_filter_s+ 6] ; 23
+ sub srcq, 2
+ vbroadcasti128 m7, [z_filter_s+10] ; 45
+ lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
+ movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)]
+ vpbroadcastw m8, [mxq+0]
+ vpbroadcastw m9, [mxq+2]
+ add wq, r8
+ vpbroadcastw m10, [mxq+4]
+ jmp wq
.h_w8:
-%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
- pshufb m%2, m%1, m7
- pshufb m%3, m%1, m8
- pshufb m%1, m6
- pmaddubsw m%4, m%2, m9
- pmaddubsw m%2, m10
- pmaddubsw m%3, m10
- pmaddubsw m%1, m9
- paddw m%3, m%4
+%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
+ pshufb m%2, m%1, m4
+ pmaddubsw m%2, m8
+ pshufb m%3, m%1, m6
+ pmaddubsw m%3, m9
+ pshufb m%1, m7
+ pmaddubsw m%1, m10
+ paddw m%2, m5
+ paddw m%1, m%3
paddw m%1, m%2
- phaddw m%1, m%3
- paddw m%1, m5
psraw m%1, 6
%endmacro
movu xm0, [srcq+ssq*0]
vinserti128 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 1, 2, 3
+ PUT_6TAP_H 0, 1, 2
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
@@ -1581,9 +1577,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vinserti128 m0, [srcq+ssq*1+8*0], 1
movu xm1, [srcq+ssq*0+8*1]
vinserti128 m1, [srcq+ssq*1+8*1], 1
- PUT_8TAP_H 0, 2, 3, 4
+ PUT_6TAP_H 0, 2, 3
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 1, 2, 3, 4
+ PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
@@ -1606,8 +1602,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
.h_loop:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 2, 3, 4
+ PUT_6TAP_H 0, 2, 3
+ PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+r6], m0
add r6, 32
@@ -1619,7 +1615,421 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_loop
RET
.v:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 9, 12
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
+ vpbroadcastd m8, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters+1-put_avx2]
+ vpbroadcastw m5, [myq+0]
+ vpbroadcastw m6, [myq+2]
+ vpbroadcastw m7, [myq+4]
+ add r6, r8
+ mov nsq, ssq
+ neg nsq
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+nsq*2]
+ pinsrw xm2, [srcq+nsq*1], 2
+ pinsrw xm2, [srcq+ssq*0], 4
+ pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastd xm0, [srcq+ssq*0]
+ palignr xm3, xm0, xm2, 4 ; 1 2 3 4
+ punpcklbw xm1, xm2, xm3 ; 01 12
+ punpckhbw xm2, xm3 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xm3, xm1, xm5 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm6 ; a1 b1
+ paddw xm3, xm2
+ vpblendd xm2, xm0, xm4, 0x02 ; 4 5
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 5 6
+ punpcklbw xm2, xm4 ; 45 56
+ pmaddubsw xm4, xm2, xm7 ; a2 b2
+ paddw xm3, xm4
+ pmulhrsw xm3, xm8
+ packuswb xm3, xm3
+ pextrw [dstq+dsq*0], xm3, 0
+ pextrw [dstq+dsq*1], xm3, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+nsq*2]
+ pinsrd xm2, [srcq+nsq*1], 1
+ pinsrd xm2, [srcq+ssq*0], 2
+ pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastd xm0, [srcq+ssq*0]
+ palignr xm3, xm0, xm2, 4 ; 1 2 3 4
+ punpcklbw xm1, xm2, xm3 ; 01 12
+ punpckhbw xm2, xm3 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xm3, xm1, xm5 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm6 ; a1 b1
+ paddw xm3, xm2
+ vpblendd xm2, xm0, xm4, 0x02 ; 4 5
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 5 6
+ punpcklbw xm2, xm4 ; 45 56
+ pmaddubsw xm4, xm2, xm7 ; a2 b2
+ paddw xm3, xm4
+ pmulhrsw xm3, xm8
+ packuswb xm3, xm3
+ movd [dstq+dsq*0], xm3
+ pextrd [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+nsq*2]
+ vpbroadcastq m3, [srcq+nsq*1]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m3, 0x30
+ vpblendd m3, m2, 0x30
+ punpcklbw m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 23 34
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m3, m1, m5 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m6 ; a1 b1
+ paddw m3, m2
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 45 56
+ pmaddubsw m4, m2, m7 ; a2 b2
+ paddw m3, m4
+ pmulhrsw m3, m8
+ vextracti128 xm4, m3, 1
+ packuswb xm3, xm4
+ movq [dstq+dsq*0], xm3
+ movhps [dstq+dsq*1], xm3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ WIN64_PUSH_XMM 12
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m3, [srcq+nsq*2]
+ vbroadcasti128 m4, [srcq+nsq*1]
+ lea r4, [srcq+ssq*2]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ mov r7, dstq
+ vbroadcasti128 m2, [r4+ssq*0]
+ shufpd m3, m0, 0x0c
+ shufpd m4, m1, 0x0c
+ punpcklbw m1, m3, m4 ; 01
+ punpckhbw m3, m4 ; 23
+ shufpd m0, m2, 0x0c
+ punpcklbw m2, m4, m0 ; 12
+ punpckhbw m4, m0 ; 34
+.v_w16_loop:
+ vbroadcasti128 m9, [r4+ssq*1]
+ pmaddubsw m10, m1, m5 ; a0
+ lea r4, [r4+ssq*2]
+ pmaddubsw m11, m2, m5 ; b0
+ mova m1, m3
+ pmaddubsw m3, m6 ; a1
+ mova m2, m4
+ pmaddubsw m4, m6 ; b1
+ paddw m10, m3
+ vbroadcasti128 m3, [r4+ssq*0]
+ paddw m11, m4
+ shufpd m4, m0, m9, 0x0d
+ shufpd m0, m9, m3, 0x0c
+ punpcklbw m3, m4, m0 ; 45
+ punpckhbw m4, m0 ; 56
+ pmaddubsw m9, m3, m7 ; a2
+ paddw m10, m9
+ pmaddubsw m9, m4, m7 ; b2
+ paddw m11, m9
+ pmulhrsw m10, m8
+ pmulhrsw m11, m8
+ packuswb m10, m11
+ vpermq m10, m10, q3120
+ mova [r7+dsq*0], xm10
+ vextracti128 [r7+dsq*1], m10, 1
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ WIN64_SPILL_XMM 12, 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
+ vpbroadcastd m7, [pw_8192]
+ punpcklbw m0, m0
+ vpbroadcastd m8, [pd_512]
+ psraw m0, 8 ; sign-extend
+ mov nsq, ssq
+ pshufd m9, m0, q0000
+ neg nsq
+ pshufd m10, m0, q1111
+ pshufd m11, m0, q2222
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m5, [subpel_h_shuf4]
+ movq xm2, [srcq+nsq*2]
+ movhps xm2, [srcq+nsq*1]
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m1, 0x30
+ pshufb m2, m5
+ pshufb xm0, xm5
+ pmaddubsw m2, m6
+ pmaddubsw xm0, xm6
+ phaddw m2, m0
+ pmulhrsw m2, m7
+ vextracti128 xm0, m2, 1
+ palignr xm0, xm2, 4
+ punpcklwd xm1, xm2, xm0 ; 01 12
+ punpckhwd xm2, xm0 ; 23 34
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm5
+ pmaddubsw xm4, xm6
+ pmaddwd xm3, xm9, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm10 ; a1 b1
+ phaddw xm4, xm4
+ paddd xm3, xm2
+ pmulhrsw xm4, xm7
+ palignr xm2, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm2, xm4 ; 45 56
+ pmaddwd xm4, xm11, xm2 ; a2 b2
+ paddd xm3, xm8
+ paddd xm3, xm4
+ psrad xm3, 10
+ packssdw xm3, xm3
+ packuswb xm3, xm3
+ pextrw [dstq+dsq*0], xm3, 0
+ pextrw [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m5, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+nsq*2]
+ vpbroadcastq m4, [srcq+nsq*1]
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m1, m3, 0xcc ; 2 3
+ pshufb m2, m5
+ pshufb m1, m5
+ pshufb m0, m5
+ pmaddubsw m2, m6
+ pmaddubsw m1, m6
+ pmaddubsw m0, m6
+ phaddw m2, m1
+ phaddw m0, m0
+ pmulhrsw m2, m7
+ pmulhrsw m0, m7
+ palignr m3, m0, m2, 4
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3 ; 23 34
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m3, m9, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m10 ; a1 b1
+ paddd m3, m2
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpblendd m4, m2, 0xcc ; 5 6
+ pshufb m4, m5
+ pmaddubsw m4, m6
+ phaddw m4, m4
+ pmulhrsw m4, m7
+ palignr m2, m4, m0, 12
+ mova m0, m4
+ punpcklwd m2, m4 ; 45 56
+ pmaddwd m4, m11, m2 ; a2 b2
+ paddd m3, m8
+ paddd m3, m4
+ psrad m3, 10
+ vextracti128 xm4, m3, 1
+ packssdw xm3, xm4
+ packuswb xm3, xm3
+ pshuflw xm3, xm3, q3120
+ movd [dstq+dsq*0], xm3
+ pextrd [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 2
+ lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
+ WIN64_PUSH_XMM 16
+ vpbroadcastw m10, [mxq+0]
+ vpbroadcastw m11, [mxq+2]
+ vpbroadcastw m12, [mxq+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
+ lea r6d, [wq*8-64]
+ vbroadcasti128 m8, [z_filter_s+ 6]
+ punpcklbw m0, m0
+ vbroadcasti128 m9, [z_filter_s+10]
+ psraw m0, 8 ; sign-extend
+ mov nsq, ssq
+ pshufd m13, m0, q0000
+ neg nsq
+ pshufd m14, m0, q1111
+ lea r6d, [hq+r6*4]
+ pshufd m15, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m7, [z_filter_s+2]
+ movu xm3, [srcq+nsq*2]
+ lea r4, [srcq+ssq*2]
+ movu xm4, [srcq+nsq*1]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ mov r7, dstq
+ vinserti128 m4, [srcq+ssq*1], 1 ; 1 3
+ vpblendd m3, m0, 0xf0 ; 0 2
+ vinserti128 m0, [r4+ssq*0], 1 ; 2 4
+ vpbroadcastd m5, [pw_8192]
+%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
+ pshufb %2, %1, %4
+ pmaddubsw %2, m10
+ pshufb %3, %1, %5
+ pmaddubsw %3, m11
+ pshufb %1, %6
+ pmaddubsw %1, m12
+ paddw %2, %3
+ paddw %1, %2
+%endmacro
+ HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ pmulhrsw m3, m5
+ pmulhrsw m4, m5
+ pmulhrsw m0, m5
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ movu xm7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti128 m7, [r4+ssq*0], 1 ; 5 6
+ pmaddwd m5, m13, m1 ; a0
+ mova m1, m3
+ pmaddwd m6, m13, m2 ; b0
+ mova m2, m4
+ pmaddwd m3, m14 ; a1
+ pmaddwd m4, m14 ; b1
+ paddd m5, m3
+ vbroadcasti128 m3, [z_filter_s+2]
+ paddd m6, m4
+ HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
+ vpbroadcastd m3, [pw_8192]
+ vpbroadcastd m4, [pd_512]
+ pmulhrsw m7, m3
+ paddd m5, m4
+ paddd m6, m4
+ mova m4, m0
+ vpermq m0, m7, q3120
+ shufpd m4, m0, 0x05
+ punpcklwd m3, m4, m0 ; 45
+ pmaddwd m7, m15, m3 ; a2
+ punpckhwd m4, m0 ; 56
+ paddd m5, m7
+ pmaddwd m7, m15, m4 ; b2
+ paddd m6, m7
+ psrad m5, 10
+ psrad m6, 10
+ packssdw m5, m6
+ vextracti128 xm6, m5, 1
+ packuswb xm5, xm6
+ pshufd xm5, xm5, q3120
+ movq [r7+dsq*0], xm5
+ movhps [r7+dsq*1], xm5
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 8
+ add dstq, 8
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc
+PUT_8TAP_FN sharp, SHARP, SHARP
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
+.v:
+ WIN64_SPILL_XMM 12, 15
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1765,19 +2175,19 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
.v_w64:
.v_w128:
lea r6d, [wq*8-128]
- mov r4, srcq
- mov r7, dstq
+ WIN64_PUSH_XMM 15
lea r6d, [hq+r6*2]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+ssq*0]
vbroadcasti128 m5, [srcq+ssq*1]
+ lea r4, [srcq+ss3q]
vbroadcasti128 m6, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m0, [srcq+ssq*0]
- vbroadcasti128 m1, [srcq+ssq*1]
- vbroadcasti128 m2, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m3, [srcq+ssq*0]
+ vbroadcasti128 m0, [r4+ssq*0]
+ mov r7, dstq
+ vbroadcasti128 m1, [r4+ssq*1]
+ vbroadcasti128 m2, [r4+ssq*2]
+ add r4, ss3q
+ vbroadcasti128 m3, [r4+ssq*0]
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
@@ -1789,50 +2199,137 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
- vbroadcasti128 m12, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m13, [srcq+ssq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
+ vbroadcasti128 m12, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ pmaddubsw m13, m1, m8 ; a0
+ pmaddubsw m14, m2, m8 ; b0
mova m1, m3
mova m2, m4
pmaddubsw m3, m9 ; a1
pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
+ paddw m13, m3
+ paddw m14, m4
mova m3, m5
mova m4, m6
pmaddubsw m5, m10 ; a2
pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
+ paddw m13, m5
+ vbroadcasti128 m5, [r4+ssq*0]
+ paddw m14, m6
shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
+ shufpd m0, m12, m5, 0x0c
punpcklbw m5, m6, m0 ; 67
punpckhbw m6, m0 ; 78
pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
+ paddw m13, m12
+ pmaddubsw m12, m6, m11 ; b3
paddw m14, m12
- paddw m15, m13
+ pmulhrsw m13, m7
pmulhrsw m14, m7
- pmulhrsw m15, m7
- packuswb m14, m15
- vpermq m14, m14, q3120
- mova [dstq+dsq*0], xm14
- vextracti128 [dstq+dsq*1], m14, 1
- lea dstq, [dstq+dsq*2]
+ packuswb m13, m14
+ vpermq m13, m13, q3120
+ mova [r7+dsq*0], xm13
+ vextracti128 [r7+dsq*1], m13, 1
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .v_w16_loop
- add r4, 16
- add r7, 16
+ add srcq, 16
+ add dstq, 16
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
+.h: ; horizontal path of put_8tap_8bpc, reached via "jnz .h" when (mxd & 0xf00) != 0
+.h_w2: ; same address as .h (no code in between)
+.h_w4: ; same address as .h (no code in between)
+ test myd, 0xf00 ; bits 8-11 of myd = the "my" field per the layout comment at function entry
+ jnz .hv ; vertical fraction is nonzero too -> combined .hv path
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2); word constant added before the "psraw 6" in PUT_8TAP_H
+ cmp wd, 4
+ jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2 ; w <= 4: jump into the .h_w2 code of put_6tap_8bpc_avx2
+ WIN64_SPILL_XMM 11 ; x86inc macro defined outside this view; presumably preserves callee-saved xmm regs on WIN64 (m0-m10 are used below) - confirm
+ tzcnt wd, wd ; wd = trailing-zero count of w, used as the jump-table index below
+ vbroadcasti128 m6, [subpel_h_shufA] ; m6/m7/m8 = the three pshufb masks consumed by PUT_8TAP_H
+ shr mxd, 16 ; keep bits 16+ of mxd (the "8tap_h" field per the layout comment at function entry)
+ vbroadcasti128 m7, [subpel_h_shufB]
+ sub srcq, 3 ; move src back by 3 bytes before filtering
+ vbroadcasti128 m8, [subpel_h_shufC]
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] ; 16-bit jump-table entry for this width
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] ; m9 = dword at +0 of the selected 8-byte subpel_filters entry
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] ; m10 = dword at +4 of the same entry
+ add wq, r8 ; table entry is relative to r8 (= put_avx2, loaded at function entry)
+ jmp wq ; dispatch to the width-specific horizontal loop
+.h_w8: ; horizontal filter, 8 pixels wide, two rows per iteration
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]; reads m5 (pw_34), m6-m8 (subpel_h_shufA/B/C), m9/m10 (filter dwords +0/+4) as set up in .h
+ pshufb m%2, m%1, m7 ; tmp1 = src bytes gathered with the shufB mask
+ pshufb m%3, m%1, m8 ; tmp2 = src bytes gathered with the shufC mask
+ pshufb m%1, m6 ; src = src bytes gathered with the shufA mask (in place)
+ pmaddubsw m%4, m%2, m9 ; tmp3 = shufB bytes (unsigned) * m9 coefficients (signed), adjacent pairs summed to words
+ pmaddubsw m%2, m10 ; tmp1 = shufB bytes * m10 coefficients
+ pmaddubsw m%3, m10 ; tmp2 = shufC bytes * m10 coefficients
+ pmaddubsw m%1, m9 ; src = shufA bytes * m9 coefficients
+ paddw m%3, m%4 ; tmp2 += tmp3
+ paddw m%1, m%2 ; src += tmp1
+ phaddw m%1, m%3 ; add adjacent word pairs of both partial-sum vectors into one result vector
+ paddw m%1, m5 ; add the pw_34 constant
+ psraw m%1, 6 ; arithmetic shift right by 6; result is left as 16-bit words
+%endmacro
+ movu xm0, [srcq+ssq*0] ; low lane = 16 bytes of row 0
+ vinserti128 m0, [srcq+ssq*1], 1 ; high lane = 16 bytes of row 1
+ lea srcq, [srcq+ssq*2] ; advance src by two rows
+ PUT_8TAP_H 0, 1, 2, 3 ; filter both rows; m1-m3 are scratch
+ vextracti128 xm1, m0, 1 ; xm1 = row 1 words
+ packuswb xm0, xm1 ; saturate to unsigned bytes: row 0 in low qword, row 1 in high qword
+ movq [dstq+dsq*0], xm0 ; store 8 bytes of row 0
+ movhps [dstq+dsq*1], xm0 ; store 8 bytes of row 1
+ lea dstq, [dstq+dsq*2] ; advance dst by two rows
+ sub hd, 2
+ jg .h_w8 ; loop while rows remain
+ RET
+.h_w16: ; horizontal filter, 16 pixels wide, two rows per iteration
+ movu xm0, [srcq+ssq*0+8*0] ; m0 low lane = row 0, bytes 0-15
+ vinserti128 m0, [srcq+ssq*1+8*0], 1 ; m0 high lane = row 1, bytes 0-15
+ movu xm1, [srcq+ssq*0+8*1] ; m1 low lane = row 0, bytes 8-23
+ vinserti128 m1, [srcq+ssq*1+8*1], 1 ; m1 high lane = row 1, bytes 8-23
+ PUT_8TAP_H 0, 2, 3, 4 ; filter m0; m2-m4 are scratch
+ lea srcq, [srcq+ssq*2] ; advance src by two rows (all loads for this iteration are done)
+ PUT_8TAP_H 1, 2, 3, 4 ; filter m1; m2-m4 are scratch
+ packuswb m0, m1 ; per lane: saturate the words of m0 and m1 into 16 unsigned bytes
+ mova [dstq+dsq*0], xm0 ; low lane -> row 0
+ vextracti128 [dstq+dsq*1], m0, 1 ; high lane -> row 1
+ lea dstq, [dstq+dsq*2] ; advance dst by two rows
+ sub hd, 2
+ jg .h_w16 ; loop while rows remain
+ RET
+.h_w32: ; widths 32/64/128 share .h_loop; r6 = negative byte offset of the first 32-byte column
+ xor r6d, r6d ; w32: one column, offset 0
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1 ; w64: column offsets -32, 0
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3 ; w128: column offsets -96, -64, -32, 0
+.h_start:
+ sub srcq, r6 ; bias src forward by -r6 so that [srcq+r6] addresses the first column
+ sub dstq, r6 ; same bias for dst
+ mov r4, r6 ; keep the starting offset to restart each row
+.h_loop:
+ movu m0, [srcq+r6+8*0] ; 32 source bytes at the current column offset
+ movu m1, [srcq+r6+8*1] ; 32 source bytes starting 8 bytes further right
+ PUT_8TAP_H 0, 2, 3, 4 ; filter m0; m2-m4 are scratch
+ PUT_8TAP_H 1, 2, 3, 4 ; filter m1; m2-m4 are scratch
+ packuswb m0, m1 ; saturate both word vectors into 32 unsigned bytes
+ mova [dstq+r6], m0 ; store 32 output bytes
+ add r6, 32 ; next column
+ jle .h_loop ; continue while the new offset is <= 0 (last column sits at offset 0)
+ add srcq, ssq ; next source row
+ add dstq, dsq ; next destination row
+ mov r6, r4 ; restart at the first column
+ dec hd
+ jg .h_loop ; loop while rows remain
+ RET
.hv:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 14, 16
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -1975,6 +2472,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .hv_w4_loop
RET
.hv_w8:
+ WIN64_PUSH_XMM 16
shr mxd, 16
sub srcq, 3
vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
@@ -1993,24 +2491,23 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pshufd m14, m0, q2222
pshufd m15, m0, q3333
lea r6d, [wq*8-64]
- mov r4, srcq
- mov r7, dstq
lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
movu xm4, [srcq+ssq*0]
+ lea r4, [srcq+ss3q]
vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+ssq*1]
+ mov r7, dstq
vbroadcasti128 m9, [subpel_h_shufC]
movu xm6, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m0, [srcq+ssq*0]
- vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
- vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
- add srcq, ss3q
- vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
-%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ vbroadcasti128 m0, [r4+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [r4+ssq*1], 1 ; 1 4
+ vinserti128 m6, [r4+ssq*2], 1 ; 2 5
+ add r4, ss3q
+ vinserti128 m0, [r4+ssq*0], 1 ; 3 6
+%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
pshufb %3, %1, %6
pshufb %4, %1, %7
pshufb %1, %5
@@ -2022,10 +2519,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
paddw %1, %3
phaddw %1, %2
%endmacro
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
vpbroadcastd m7, [pw_8192]
vpermq m4, m4, q3120
vpermq m5, m5, q3120
@@ -2043,9 +2540,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
punpckhwd m6, m7 ; 56
.hv_w8_loop:
vextracti128 r6m, m0, 1 ; not enough registers
- movu xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ movu xm0, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti128 m0, [r4+ssq*0], 1 ; 7 8
pmaddwd m8, m1, m12 ; a0
pmaddwd m9, m2, m12 ; b0
mova m1, m3
@@ -2063,15 +2560,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
vpbroadcastd m5, [pw_8192]
vpbroadcastd m7, [pd_512]
vbroadcasti128 m6, r6m
pmulhrsw m0, m5
paddd m8, m7
paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, m15 ; a3
@@ -2084,34 +2581,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vextracti128 xm7, m8, 1
packuswb xm8, xm7
pshufd xm7, xm8, q3120
- movq [dstq+dsq*0], xm7
- movhps [dstq+dsq*1], xm7
- lea dstq, [dstq+dsq*2]
+ movq [r7+dsq*0], xm7
+ movhps [r7+dsq*1], xm7
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .hv_w8_loop
- add r4, 8
- add r7, 8
+ add srcq, 8
+ add dstq, 8
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
-%macro PREP_8TAP_H 0
- pshufb m1, m0, m5
- pshufb m2, m0, m6
- pshufb m3, m0, m7
- pmaddubsw m1, m8
- pmaddubsw m0, m2, m8
- pmaddubsw m2, m9
- pmaddubsw m3, m9
- paddw m1, m2
- paddw m0, m3
- phaddw m0, m1, m0
- pmulhrsw m0, m4
-%endmacro
-
%if WIN64
DECLARE_REG_TMP 6, 4
%else
@@ -2119,71 +2600,197 @@ DECLARE_REG_TMP 6, 7
%endif
%define PREP_8TAP_FN FN prep_8tap,
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN regular, REGULAR, REGULAR
-cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r7, [prep%+SUFFIX]
- movsxd wq, wm
+ mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+.prep:
tzcnt wd, wd
movzx wd, word [r7+wq*2+table_offset(prep,)]
add wq, r7
- lea r6, [strideq*3]
+ lea r6, [ssq*3]
%if WIN64
pop r7
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m4, [pw_8192]
- vbroadcasti128 m5, [subpel_h_shufA]
- WIN64_SPILL_XMM 10
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
- vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
- add wq, r7
- jmp wq
+.v:
+ WIN64_SPILL_XMM 10, 12
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
+ vpbroadcastd m9, [pw_8192]
+ vpbroadcastw m6, [myq+0]
+ mov nsq, ssq
+ vpbroadcastw m7, [myq+2]
+ neg nsq
+ vpbroadcastw m8, [myq+4]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm2, [srcq+nsq*2]
+ pinsrd xm2, [srcq+nsq*1], 1
+ vpbroadcastd m1, [srcq+ssq*0]
+ vpbroadcastd m3, [srcq+ssq*1]
+ vpbroadcastd m0, [srcq+ssq*2]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m1, m2, 0xeb
+ punpcklqdq m3, m0
+ vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _
+ pshufb m1, m5 ; 01 12 23 34
+.v_w4_loop:
+ lea srcq, [srcq+ssq*4]
+ pinsrd xm0, [srcq+nsq*1], 1
+ vpbroadcastd m2, [srcq+ssq*0]
+ vpbroadcastd m3, [srcq+ssq*1]
+ vpblendd m2, m0, 0xeb
+ vpbroadcastd m0, [srcq+ssq*2]
+ punpcklqdq m3, m0
+ vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _
+ pshufb m2, m5 ; 45 56 67 78
+ pmaddubsw m3, m1, m6 ; a0 b0 c0 d0
+ vperm2i128 m1, m2, 0x21 ; 23 34 45 56
+ pmaddubsw m4, m2, m8 ; a2 b2 c2 d2
+ pmaddubsw m1, m7 ; a1 b1 c1 d1
+ paddw m3, m4
+ paddw m3, m1
+ pmulhrsw m3, m9
+ mova m1, m2
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+nsq*2]
+ vpbroadcastq m3, [srcq+nsq*1]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m1, m3, 0x30
+ vpblendd m3, m2, 0x30
+ punpcklbw m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 23 34
+.v_w8_loop:
+ lea srcq, [srcq+ssq*4]
+ pmaddubsw m1, m6 ; a0
+ vpbroadcastq m3, [srcq+nsq*1]
+ pmaddubsw m4, m2, m7 ; a1
+ pmaddubsw m5, m2, m6 ; b0
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpblendd m0, m3, 0x30
+ vpblendd m3, m2, 0x30
+ paddw m4, m1
+ punpcklbw m1, m0, m3 ; 45 56
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m2, m3, 0x30
+ vpblendd m3, m0, 0x30
+ punpcklbw m2, m3 ; 67 78
+ pmaddubsw m3, m1, m7 ; b1
+ paddw m5, m3
+ pmaddubsw m3, m1, m8 ; a2
+ paddw m4, m3
+ pmaddubsw m3, m2, m8 ; b2
+ paddw m5, m3
+ pmulhrsw m4, m9
+ pmulhrsw m5, m9
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16: ; vertical 6-tap filter for w >= 16, processed in 16-pixel-wide columns, two rows per inner iteration
+ lea r6d, [wq*2-32]
+ lea srcq, [srcq+nsq*2] ; nsq = -ssq (set up in .v), so this moves src up by two rows
+ WIN64_PUSH_XMM 12 ; x86inc macro defined outside this view; presumably completes the xmm save announced by "WIN64_SPILL_XMM 10, 12" in .v - confirm
+ lea r6d, [hq+r6*8] ; r6d = h + (w-16)*16: low byte holds h, bits 8+ hold the number of extra 16-pixel columns
+.v_w16_loop0: ; per-column setup: load the first five rows
+ vbroadcasti128 m3, [srcq+ssq*0] ; row 0 in both lanes
+ lea r5, [srcq+ssq*2] ; r5 = running row pointer for this column
+ vbroadcasti128 m4, [srcq+ssq*1] ; row 1
+ mov r7, tmpq ; r7 = running output pointer for this column
+ vbroadcasti128 m0, [r5+ssq*0] ; row 2
+ vbroadcasti128 m1, [r5+ssq*1] ; row 3
+ lea r5, [r5+ssq*2]
+ vbroadcasti128 m2, [r5+ssq*0] ; row 4
+ shufpd m3, m0, 0x0c ; each lane: one qword of row 0 followed by the matching qword of row 2
+ shufpd m4, m1, 0x0c ; each lane: one qword of row 1 followed by the matching qword of row 3
+ punpcklbw m1, m3, m4 ; 01 (rows 0 and 1 byte-interleaved)
+ punpckhbw m3, m4 ; 23
+ shufpd m0, m2, 0x0c ; each lane: one qword of row 2 followed by the matching qword of row 4
+ punpcklbw m2, m4, m0 ; 12
+ punpckhbw m4, m0 ; 34
+.v_w16_loop: ; produces output rows "a" and "b"; m6/m7/m8 = coefficient pairs at myq+0/+2/+4, m9 = pw_8192 (set up in .v)
+ vbroadcasti128 m5, [r5+ssq*1] ; row 5
+ pmaddubsw m10, m1, m6 ; a0
+ lea r5, [r5+ssq*2]
+ pmaddubsw m11, m2, m6 ; b0
+ mova m1, m3 ; slide the window: 23 becomes the next iteration's 01
+ pmaddubsw m3, m7 ; a1
+ mova m2, m4 ; slide the window: 34 becomes the next iteration's 12
+ pmaddubsw m4, m7 ; b1
+ paddw m10, m3
+ vbroadcasti128 m3, [r5+ssq*0] ; row 6
+ paddw m11, m4
+ shufpd m4, m0, m5, 0x0d ; each lane: the row 4 qword held in m0 followed by the matching qword of row 5
+ shufpd m0, m5, m3, 0x0c ; each lane: one qword of row 5 followed by the matching qword of row 6
+ punpcklbw m3, m4, m0 ; 45
+ punpckhbw m4, m0 ; 56
+ pmaddubsw m5, m3, m8 ; a2
+ paddw m10, m5
+ pmaddubsw m5, m4, m8 ; b2
+ paddw m11, m5
+ pmulhrsw m10, m9 ; multiply by pw_8192 with rounding (equivalent to (x + 2) >> 2)
+ pmulhrsw m11, m9
+ mova [r7+wq*0], m10 ; output row a (16 words)
+ mova [r7+wq*2], m11 ; output row b, w*2 bytes further on
+ lea r7, [r7+wq*4] ; advance the output pointer by two rows
+ sub hd, 2
+ jg .v_w16_loop ; loop while rows remain in this column
+ add srcq, 16 ; next 16-pixel source column
+ add tmpq, 32 ; next output column (16 words = 32 bytes)
+ movzx hd, r6b ; reload h from the low byte of r6
+ sub r6d, 1<<8 ; one column done
+ jg .v_w16_loop0 ; loop while columns remain
+ RET
.h_w4:
+ RESET_STACK_STATE
movzx mxd, mxb
+ vbroadcasti128 m3, [subpel_h_shufA]
dec srcq
- vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
- lea stride3q, [strideq*3]
+ vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea r3, [ssq*3]
.h_w4_loop:
- movq xm0, [srcq+strideq*0]
- vpbroadcastq m2, [srcq+strideq*2]
- movq xm1, [srcq+strideq*1]
- vpblendd m0, m2, 0xf0
- vpbroadcastq m2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m2, 0xf0
- pshufb m0, m5
- pshufb m1, m5
- pmaddubsw m0, m6
- pmaddubsw m1, m6
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*2]
+ movq xm1, [srcq+ssq*1]
+ vpblendd m0, m2, 0x30
+ vpbroadcastq m2, [srcq+r3 ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m1, m2, 0x30
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
phaddw m0, m1
pmulhrsw m0, m4
mova [tmpq], m0
@@ -2191,25 +2798,56 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
sub hd, 4
jg .h_w4_loop
RET
+.h: ; horizontal path of prep_6tap_8bpc, reached via "jnz .h" when (mxd & 0xf00) != 0
+ test myd, 0xf00 ; bits 8-11 of myd = the "my" field per the layout comment at function entry
+ jnz .hv ; vertical fraction is nonzero too -> combined .hv path
+ vpbroadcastd m4, [pw_8192] ; pmulhrsw multiplier used by .h_w4 and PREP_6TAP_H
+ cmp wd, 4
+ je .h_w4 ; w == 4 is handled by .h_w4
+ WIN64_SPILL_XMM 10 ; x86inc macro defined outside this view; presumably preserves callee-saved xmm regs on WIN64 (m0-m9 are used below) - confirm
+ tzcnt wd, wd ; wd = trailing-zero count of w, used as the jump-table index below
+ vbroadcasti128 m3, [z_filter_s+ 2] ; m3/m5/m6 = the three pshufb masks consumed by PREP_6TAP_H
+ shr mxd, 16 ; keep bits 16+ of mxd (the "6tap_h" field per the layout comment at function entry)
+ vbroadcasti128 m5, [z_filter_s+ 6]
+ sub srcq, 2 ; move src back by 2 bytes before filtering
+ vbroadcasti128 m6, [z_filter_s+10]
+ lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX] ; mxq -> byte 1 of the selected 8-byte subpel_filters entry
+ movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)] ; 16-bit jump-table entry for this width
+ vpbroadcastw m7, [mxq+0] ; m7 = entry bytes 1-2
+ vpbroadcastw m8, [mxq+2] ; m8 = entry bytes 3-4
+ add wq, r7 ; table entry is relative to r7 (= prep%+SUFFIX base, loaded at function entry)
+ vpbroadcastw m9, [mxq+4] ; m9 = entry bytes 5-6
+ jmp wq ; dispatch to the width-specific horizontal loop
.h_w8:
- movu xm0, [srcq+strideq*0]
- vinserti128 m0, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+%macro PREP_6TAP_H 0 ; in/out: m0; scratch: m1, m2; reads m3/m5/m6 (z_filter_s masks), m7/m8/m9 (coefficient pairs), m4 (pw_8192) as set up in .h
+ pshufb m1, m0, m3 ; m1 = src bytes gathered with the z_filter_s+2 mask
+ pmaddubsw m1, m7 ; multiply by the first coefficient pair, adjacent byte pairs summed to words
+ pshufb m2, m0, m5 ; m2 = src bytes gathered with the z_filter_s+6 mask
+ pmaddubsw m2, m8 ; multiply by the second coefficient pair
+ pshufb m0, m6 ; m0 = src bytes gathered with the z_filter_s+10 mask (in place)
+ pmaddubsw m0, m9 ; multiply by the third coefficient pair
+ paddw m1, m2 ; sum the three partial products ...
+ paddw m0, m1 ; ... into m0
+ pmulhrsw m0, m4 ; multiply by pw_8192 with rounding (equivalent to (x + 2) >> 2)
+%endmacro
+ PREP_6TAP_H
mova [tmpq], m0
add tmpq, 32
sub hd, 2
jg .h_w8
RET
.h_w16:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ PREP_6TAP_H
mova [tmpq+32*0], m0
- movu xm0, [srcq+strideq*1+8*0]
- vinserti128 m0, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*1+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PREP_6TAP_H
mova [tmpq+32*1], m0
add tmpq, 32*2
sub hd, 2
@@ -2229,27 +2867,219 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
.h_loop:
movu xm0, [srcq+r6+8*0]
vinserti128 m0, [srcq+r6+8*1], 1
- PREP_8TAP_H
+ PREP_6TAP_H
mova [tmpq+32*0], m0
movu xm0, [srcq+r6+8*2]
vinserti128 m0, [srcq+r6+8*3], 1
- PREP_8TAP_H
+ PREP_6TAP_H
mova [tmpq+32*1], m0
add tmpq, 32*2
add r6, 32
jle .h_loop
- add srcq, strideq
+ add srcq, ssq
mov r6, r5
dec hd
jg .h_loop
RET
+.hv:
+ WIN64_SPILL_XMM 14, 16
+ cmp wd, 4
+ jne .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
+ mov nsq, ssq
+ pmovzxbd m13, [deint_shuf4]
+ neg nsq
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_32]
+ punpcklbw m0, m0
+ vpbroadcastq m2, [srcq+nsq*2]
+ psraw m0, 8 ; sign-extend
+ vpbroadcastq m4, [srcq+nsq*1]
+ pshufd m10, m0, q0000
+ vpbroadcastq m1, [srcq+ssq*0]
+ pshufd m11, m0, q1111
+ vpbroadcastq m3, [srcq+ssq*1]
+ pshufd m12, m0, q2222
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m1, m3, 0xcc ; 2 3
+ pshufb m2, m6
+ pshufb m1, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m1, m7
+ pmaddubsw m0, m7
+ phaddw m2, m1 ; 0 1 2 3
+ phaddw m0, m0 ; 4
+ pmulhrsw m2, m8
+ pmulhrsw m0, m8
+ palignr m0, m2, 4
+ punpcklwd m1, m2, m0 ; 01 12
+ punpckhwd m2, m0 ; 23 34
+.hv_w4_loop:
+ pmaddwd m4, m10, m1 ; a0 b0
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m5, m2, m10 ; c0 d0
+ vpbroadcastq m1, [srcq+nsq*1]
+ pmaddwd m2, m11 ; a1 b1
+ vpbroadcastq m3, [srcq+ssq*0]
+ paddd m4, m2
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpblendd m1, m3, 0xcc ; 5 6
+ vpbroadcastq m3, [srcq+ssq*2]
+ vpblendd m2, m3, 0xcc ; 7 8
+ pshufb m1, m6
+ pshufb m2, m6
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ phaddw m1, m2 ; 5 6 7 8
+ pmulhrsw m1, m8
+ paddd m5, m9
+ paddd m4, m9
+ palignr m2, m1, m0, 12
+ mova m0, m1
+ punpcklwd m1, m2, m0 ; 45 56
+ punpckhwd m2, m0 ; 67 78
+ pmaddwd m3, m11, m1 ; c1 d1
+ paddd m5, m3
+ pmaddwd m3, m12, m1 ; a2 b2
+ paddd m4, m3
+ pmaddwd m3, m12, m2 ; c2 d2
+ paddd m5, m3
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ vpermd m4, m13, m4
+ mova [tmpq], m4
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2]
+ WIN64_PUSH_XMM 16
+ vpbroadcastw m10, [mxq+0]
+ vpbroadcastw m11, [mxq+2]
+ vpbroadcastw m12, [mxq+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2]
+ lea r7, [ssq*2+2]
+ vbroadcasti128 m8, [z_filter_s+ 6]
+ punpcklbw m0, m0
+ vbroadcasti128 m9, [z_filter_s+10]
+ psraw m0, 8 ; sign-extend
+ lea r6d, [wq*8-64]
+ pshufd m13, m0, q0000
+ sub srcq, r7
+ pshufd m14, m0, q1111
+ lea r6d, [hq+r6*4]
+ pshufd m15, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m7, [z_filter_s+2]
+ movu xm3, [srcq+ssq*0]
+ lea r5, [srcq+ssq*2]
+ movu xm4, [srcq+ssq*1]
+ vbroadcasti128 m0, [r5+ssq*0]
+ mov r7, tmpq
+ vinserti128 m4, [r5+ssq*1], 1 ; 1 3
+ lea r5, [r5+ssq*2]
+ vpblendd m3, m0, 0xf0 ; 0 2
+ vinserti128 m0, [r5+ssq*0], 1 ; 2 4
+ vpbroadcastd m5, [pw_8192]
+ HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ pmulhrsw m3, m5
+ pmulhrsw m4, m5
+ pmulhrsw m0, m5
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ movu xm7, [r5+ssq*1]
+ lea r5, [r5+ssq*2]
+ vinserti128 m7, [r5+ssq*0], 1 ; 5 6
+ pmaddwd m5, m13, m1 ; a0
+ mova m1, m3
+ pmaddwd m6, m13, m2 ; b0
+ mova m2, m4
+ pmaddwd m3, m14 ; a1
+ pmaddwd m4, m14 ; b1
+ paddd m5, m3
+ vbroadcasti128 m3, [z_filter_s+2]
+ paddd m6, m4
+ HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
+ vpbroadcastd m3, [pw_8192]
+ vpbroadcastd m4, [pd_32]
+ pmulhrsw m7, m3
+ paddd m5, m4
+ paddd m6, m4
+ mova m4, m0
+ vpermq m0, m7, q3120
+ shufpd m4, m0, 0x05
+ punpcklwd m3, m4, m0 ; 45
+ pmaddwd m7, m15, m3 ; a2
+ punpckhwd m4, m0 ; 67
+ paddd m5, m7
+ pmaddwd m7, m15, m4 ; b2
+ paddd m6, m7
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermq m5, m5, q3120
+ mova [r7+wq*0], xm5
+ vextracti128 [r7+wq*2], m5, 1
+ lea r7, [r7+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 8
+ add tmpq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc
+PREP_8TAP_FN sharp, SHARP, SHARP
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
.v:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 12, 15
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
cmove myd, mxd ; had a negligible effect on performance.
- ; TODO: Would a 6-tap code path be worth it?
lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
lea stride3q, [strideq*3]
sub srcq, stride3q
@@ -2359,72 +3189,154 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w8_loop
RET
.v_w16:
- add wd, wd
- mov r5, srcq
- mov r7, tmpq
- lea r6d, [hq+wq*8-256]
+ lea r6d, [wq*2-32]
+ WIN64_PUSH_XMM 15
+ lea r6d, [hq+r6*8]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+strideq*0]
vbroadcasti128 m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m0, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*0]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m1, [srcq+strideq*0]
- vbroadcasti128 m2, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+strideq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
+ lea r5, [srcq+strideq*2]
+ vbroadcasti128 m0, [r5+strideq*1]
+ vbroadcasti128 m6, [r5+strideq*0]
+ lea r5, [r5+strideq*2]
+ vbroadcasti128 m1, [r5+strideq*0]
+ vbroadcasti128 m2, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vbroadcasti128 m3, [r5+strideq*0]
+ mov r7, tmpq
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
+ shufpd m6, m2, 0x0c
punpcklbw m2, m5, m6 ; 12
punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
+ shufpd m0, m3, 0x0c
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
- vbroadcasti128 m12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m13, [srcq+strideq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
+ vbroadcasti128 m12, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ pmaddubsw m13, m1, m8 ; a0
+ pmaddubsw m14, m2, m8 ; b0
mova m1, m3
mova m2, m4
pmaddubsw m3, m9 ; a1
pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
+ paddw m13, m3
+ paddw m14, m4
mova m3, m5
mova m4, m6
pmaddubsw m5, m10 ; a2
pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
+ paddw m13, m5
+ vbroadcasti128 m5, [r5+strideq*0]
+ paddw m14, m6
shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
+ shufpd m0, m12, m5, 0x0c
punpcklbw m5, m6, m0 ; 67
punpckhbw m6, m0 ; 78
pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
+ paddw m13, m12
+ pmaddubsw m12, m6, m11 ; b3
paddw m14, m12
- paddw m15, m13
+ pmulhrsw m13, m7
pmulhrsw m14, m7
- pmulhrsw m15, m7
- mova [tmpq+wq*0], m14
- mova [tmpq+wq*1], m15
- lea tmpq, [tmpq+wq*2]
+ mova [r7+wq*0], m13
+ mova [r7+wq*2], m14
+ lea r7, [r7+wq*4]
sub hd, 2
jg .v_w16_loop
- add r5, 16
- add r7, 32
+ add srcq, 16
+ add tmpq, 32
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
+.h: ; horizontal path of prep_8tap_8bpc, reached via "jnz .h" when (mxd & 0xf00) != 0
+.h_w4: ; same address as .h (no code in between)
+ test myd, 0xf00 ; bits 8-11 of myd = the "my" field per the layout comment at function entry
+ jnz .hv ; vertical fraction is nonzero too -> combined .hv path
+ vpbroadcastd m4, [pw_8192] ; pmulhrsw multiplier used by PREP_8TAP_H (loaded before the w == 4 jump below)
+ cmp wd, 4
+ je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 ; w == 4: jump into the .h_w4 code of prep_6tap_8bpc_avx2
+ WIN64_SPILL_XMM 10 ; x86inc macro defined outside this view; presumably preserves callee-saved xmm regs on WIN64 (m0-m9 are used below) - confirm
+ vbroadcasti128 m5, [subpel_h_shufA] ; m5/m6/m7 = the three pshufb masks consumed by PREP_8TAP_H
+ tzcnt wd, wd ; wd = trailing-zero count of w, used as the jump-table index below
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16 ; keep bits 16+ of mxd (the "8tap_h" field per the layout comment at function entry)
+ sub srcq, 3 ; move src back by 3 bytes before filtering
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] ; 16-bit jump-table entry for this width
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] ; m8 = dword at +0 of the selected 8-byte subpel_filters entry
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] ; m9 = dword at +4 of the same entry
+ add wq, r7 ; table entry is relative to r7 (= prep%+SUFFIX base, loaded at function entry)
+ jmp wq ; dispatch to the width-specific horizontal loop
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%macro PREP_8TAP_H 0 ; in/out: m0; scratch: m1-m3; reads m5-m7 (subpel_h_shufA/B/C), m8/m9 (filter dwords +0/+4), m4 (pw_8192) as set up in .h
+ pshufb m1, m0, m5 ; m1 = src bytes gathered with the shufA mask
+ pshufb m2, m0, m6 ; m2 = src bytes gathered with the shufB mask
+ pshufb m3, m0, m7 ; m3 = src bytes gathered with the shufC mask
+ pmaddubsw m1, m8 ; m1 = shufA bytes (unsigned) * m8 coefficients (signed), adjacent pairs summed to words
+ pmaddubsw m0, m2, m8 ; m0 = shufB bytes * m8 coefficients (the original m0 is no longer needed)
+ pmaddubsw m2, m9 ; m2 = shufB bytes * m9 coefficients
+ pmaddubsw m3, m9 ; m3 = shufC bytes * m9 coefficients
+ paddw m1, m2 ; m1 += m2
+ paddw m0, m3 ; m0 += m3
+ phaddw m0, m1, m0 ; add adjacent word pairs of both partial-sum vectors into one result vector
+ pmulhrsw m0, m4 ; multiply by pw_8192 with rounding (equivalent to (x + 2) >> 2)
+%endmacro
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32: ; widths 32/64/128 share .h_loop; r6 = negative byte offset of the first 32-pixel column
+ xor r6d, r6d ; w32: one column, offset 0
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1 ; w64: column offsets -32, 0
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3 ; w128: column offsets -96, -64, -32, 0
+.h_start:
+ sub srcq, r6 ; bias src forward by -r6 so that [srcq+r6] addresses the first column
+ mov r5, r6 ; keep the starting offset to restart each row
+.h_loop:
+ movu xm0, [srcq+r6+8*0] ; low lane = 16 source bytes at the current column offset
+ vinserti128 m0, [srcq+r6+8*1], 1 ; high lane = 16 source bytes starting 8 bytes further right
+ PREP_8TAP_H ; filter m0 in place; m1-m3 are scratch
+ mova [tmpq+32*0], m0 ; store 16 output words
+ movu xm0, [srcq+r6+8*2] ; same pattern for the next 16 pixels
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0 ; store the next 16 output words
+ add tmpq, 32*2 ; tmp advances linearly by 64 bytes per iteration
+ add r6, 32 ; next 32-pixel column
+ jle .h_loop ; continue while the new offset is <= 0 (last column sits at offset 0)
+ add srcq, strideq ; next source row
+ mov r6, r5 ; restart at the first column
+ dec hd
+ jg .h_loop ; loop while rows remain
+ RET
.hv:
WIN64_SPILL_XMM 16
cmp wd, 4
@@ -2542,28 +3454,27 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
RET
.hv_w8:
lea r6d, [wq*8-64]
- mov r5, srcq
- mov r7, tmpq
lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
movu xm4, [srcq+strideq*0]
+ lea r5, [srcq+strideq*2]
vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
+ mov r7, tmpq
vbroadcasti128 m9, [subpel_h_shufC]
- movu xm6, [srcq+strideq*0]
- vbroadcasti128 m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
- vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ movu xm6, [r5+strideq*0]
+ vbroadcasti128 m0, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [r5+strideq*0], 1 ; 1 4
+ vinserti128 m6, [r5+strideq*1], 1 ; 2 5
+ lea r5, [r5+strideq*2]
+ vinserti128 m0, [r5+strideq*0], 1 ; 3 6
+ HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
vpbroadcastd m7, [pw_8192]
vpermq m4, m4, q3120
vpermq m5, m5, q3120
@@ -2580,10 +3491,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklwd m3, m6, m7 ; 23
punpckhwd m6, m7 ; 56
.hv_w8_loop:
- vextracti128 [tmpq], m0, 1 ; not enough registers
- movu xm0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ vextracti128 [r7], m0, 1 ; not enough registers
+ movu xm0, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vinserti128 m0, [r5+strideq*0], 1 ; 7 8
pmaddwd m8, m1, m12 ; a0
pmaddwd m9, m2, m12 ; b0
mova m1, m3
@@ -2601,15 +3512,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
vpbroadcastd m5, [pw_8192]
vpbroadcastd m7, [pd_32]
- vbroadcasti128 m6, [tmpq]
+ vbroadcasti128 m6, [r7]
pmulhrsw m0, m5
paddd m8, m7
paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, m15 ; a3
@@ -2620,16 +3531,14 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
psrad m7, 6
packssdw m8, m7
vpermq m7, m8, q3120
- mova [tmpq+wq*0], xm7
- vextracti128 [tmpq+wq*2], m7, 1
- lea tmpq, [tmpq+wq*4]
+ mova [r7+wq*0], xm7
+ vextracti128 [r7+wq*2], m7, 1
+ lea r7, [r7+wq*4]
sub hd, 2
jg .hv_w8_loop
- add r5, 8
- add r7, 16
+ add srcq, 8
+ add tmpq, 16
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
@@ -4008,14 +4917,14 @@ DECLARE_REG_TMP 6, 8
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
-PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put
@@ -4026,14 +4935,14 @@ DECLARE_REG_TMP 6, 7
%endif
BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
-PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index f9043f1ad3..50e670ec25 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -89,55 +89,47 @@ wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 3
db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
-bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
- db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
-bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
- db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
-bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
- db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
-bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
- db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
- db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
-bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
- db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
-bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
-spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
+ db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
+bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
+ db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
+bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+ db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
+ db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
+ db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
+bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+ db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
+ db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
+ db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
+bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71
+ db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
+ db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
+ db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
+bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11
+spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
-spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
- db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
-spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
- db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
-spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
-spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
- db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
-spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
- db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
-spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+ db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
+spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
@@ -154,34 +146,20 @@ spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 2
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
-spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
- db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
- db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
- db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
-spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
-spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
- db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
-spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
- db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
- db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
-spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
- db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
- db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
- db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
-spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
-spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
- db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
- db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
- db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
@@ -189,15 +167,14 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 1
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC: dd 0, 4, 8, 12
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
wm_420_perm64: dq 0xfedcba9876543210
@@ -205,6 +182,8 @@ wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
pb_8x0_8x8: times 8 db 0
times 8 db 8
+pb_4: times 4 db 4
+pb_32: times 4 db 32
pb_127: times 4 db 127
pw_m128 times 2 dw -128
pw_m256: times 2 dw -256
@@ -216,7 +195,6 @@ pd_32: dd 32
pd_34: dd 34
pd_63: dd 63
pd_512: dd 512
-pd_32768: dd 32768
%define pb_m64 (wm_sign+4)
%define pb_64 (wm_sign+8)
@@ -289,8 +267,10 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
@@ -401,9 +381,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
- vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_perm16]
+ add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
test mxyd, mxyd
@@ -526,9 +506,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
RET
.v:
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ imul mxyd, 255
vpbroadcastd m5, [pw_2048]
- add mxyd, 16 << 8
+ add mxyd, 16
add wq, r7
vpbroadcastw m4, mxyd
jmp wq
@@ -539,7 +519,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
lea srcq, [srcq+ssq*2]
pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xmm1, xmm1, q2301 ; 1 0
- punpcklbw xmm1, xmm0, xmm1
+ punpcklbw xmm1, xmm0
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
packuswb xmm1, xmm1
@@ -552,11 +532,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w4:
movd xmm0, [srcq+ssq*0]
.v_w4_loop:
- vpbroadcastd xmm1, [srcq+ssq*1]
+ vpbroadcastd xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1
vpbroadcastd xmm0, [srcq+ssq*0]
- vpblendd xmm1, xmm0, 0x02 ; 1 2
+ vpblendd xmm2, xmm0, 0x02 ; 1 2
punpcklbw xmm1, xmm2
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
@@ -570,11 +550,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w8:
movq xmm0, [srcq+ssq*0]
.v_w8_loop:
- movq xmm3, [srcq+ssq*1]
+ movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw xmm1, xmm3, xmm0
+ punpcklbw xmm1, xmm0, xmm2
movq xmm0, [srcq+ssq*0]
- punpcklbw xmm2, xmm0, xmm3
+ punpcklbw xmm2, xmm0
pmaddubsw xmm1, xm4
pmaddubsw xmm2, xm4
pmulhrsw xmm1, xm5
@@ -589,11 +569,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w16:
movu xmm0, [srcq+ssq*0]
.v_w16_loop:
- vbroadcasti128 ymm2, [srcq+ssq*1]
+ vbroadcasti128 ymm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1
vbroadcasti128 ymm0, [srcq+ssq*0]
- vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ vpblendd ymm3, ymm0, 0xf0 ; 1 2
punpcklbw ymm1, ymm2, ymm3
punpckhbw ymm2, ymm3
pmaddubsw ymm1, ym4
@@ -612,11 +592,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movu ym0, [srcq+ssq*0]
kxnorb k1, k1, k1
.v_w32_loop:
- vbroadcasti32x8 m2, [srcq+ssq*1]
+ vbroadcasti32x8 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendmd m3{k1}, m2, m0 ; 0 1
+ vpblendmd m2{k1}, m3, m0 ; 0 1
vbroadcasti32x8 m0, [srcq+ssq*0]
- vpblendmd m2{k1}, m0, m2 ; 1 2
+ vpblendmd m3{k1}, m0, m3 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
@@ -635,18 +615,18 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w64_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m6, m3, m0
+ punpcklbw m1, m0, m3
+ punpckhbw m6, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m6, m4
- punpcklbw m2, m0, m3
- punpckhbw m7, m0, m3
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmaddubsw m7, m4
- REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ pmaddubsw m3, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m3
packuswb m1, m6
- packuswb m2, m7
+ packuswb m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
@@ -660,13 +640,13 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
add srcq, ssq
movu m2, [srcq+64*0]
movu m3, [srcq+64*1]
- punpcklbw m6, m2, m0
+ punpcklbw m6, m0, m2
pmaddubsw m6, m4
- punpckhbw m0, m2, m0
+ punpckhbw m0, m2
pmaddubsw m0, m4
- punpcklbw m7, m3, m1
+ punpcklbw m7, m1, m3
pmaddubsw m7, m4
- punpckhbw m1, m3, m1
+ punpckhbw m1, m3
pmaddubsw m1, m4
REPX {pmulhrsw x, m5}, m6, m0, m7, m1
packuswb m6, m0
@@ -1005,8 +985,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
+ imul mxyd, 255
+ add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
test mxyd, mxyd
@@ -1032,7 +1012,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
jg .h_w4_loop
RET
.h_w8:
- vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m4, [bilin_h_perm16]
.h_w8_loop:
movu xmm0, [srcq+strideq*0]
vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
@@ -1127,8 +1107,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.v:
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
+ imul mxyd, 255
+ add mxyd, 16
add wq, t2
lea stride3q, [strideq*3]
vpbroadcastw m6, mxyd
@@ -1218,11 +1198,11 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.v_w64_loop:
vpermq m1, m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m1, m0
- punpckhbw m2, m1, m0
+ punpcklbw m4, m0, m1
+ punpckhbw m2, m0, m1
vpermq m0, m5, [srcq+strideq*0]
- punpcklbw m3, m0, m1
- punpckhbw m1, m0, m1
+ punpcklbw m3, m1, m0
+ punpckhbw m1, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
pmaddubsw m3, m6
@@ -1243,28 +1223,28 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
vpermq m2, m5, [srcq+strideq*1+ 0]
vpermq m3, m5, [srcq+strideq*1+64]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m0, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
pmaddubsw m4, m6
pmaddubsw m0, m6
mova [tmpq+64*0], m4
mova [tmpq+64*1], m0
- punpcklbw m4, m3, m1
- punpckhbw m1, m3, m1
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+64*2], m4
mova [tmpq+64*3], m1
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
- punpcklbw m4, m0, m2
- punpckhbw m2, m0, m2
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
mova [tmpq+64*4], m4
mova [tmpq+64*5], m2
- punpcklbw m4, m1, m3
- punpckhbw m3, m1, m3
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
pmaddubsw m4, m6
pmaddubsw m3, m6
mova [tmpq+64*6], m4
@@ -1308,7 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
jg .hv_w4_loop
RET
.hv_w8:
- vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m4, [bilin_h_perm16]
vbroadcasti32x4 m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
@@ -1448,7 +1428,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro FN 4 ; fn, type, type_h, type_v
+%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
@@ -1456,8 +1436,8 @@ cglobal %1_%2_8bpc
%else
mov t1d, FILTER_%4
%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
@@ -1489,24 +1469,22 @@ DECLARE_REG_TMP 4, 5
DECLARE_REG_TMP 7, 8
%endif
+; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
+; the horizontal filter, 6-tap is only used for the vertical filter.
%define PUT_8TAP_FN FN put_8tap,
-
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
-cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movsxd wq, wm
movifnidn hd, hm
@@ -1514,6 +1492,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jnz .h
test myd, 0xf00
jnz .v
+.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
@@ -1523,158 +1502,577 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pop r8
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
- WIN64_SPILL_XMM 11
- cmp wd, 4
- jl .h_w2
- vbroadcasti128 m6, [subpel_h_shufA]
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m7, [subpel_h_shufB]
- vbroadcasti128 m8, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
- vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
- vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
- add wq, r8
- jmp wq
-.h_w2:
- movzx mxd, mxb
- dec srcq
- mova xmm4, [subpel_h_shuf4]
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
-.h_w2_loop:
- movq xmm0, [srcq+ssq*0]
- movhps xmm0, [srcq+ssq*1]
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
+ vpbroadcastd m6, [pw_512]
+ lea myq, [base+subpel_filters+1+myq*8]
+ vpbroadcastw m7, [myq+0]
+ add r6, r8
+ vpbroadcastw m8, [myq+2]
+ mov nsq, ssq
+ vpbroadcastw m9, [myq+4]
+ neg nsq
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+nsq*2]
+ pinsrw xmm2, [srcq+nsq*1], 2
+ pinsrw xmm2, [srcq+ssq*0], 4
+ pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
- pshufb xmm0, xmm4
- mova xmm1, xm5
- vpdpbusd xmm1, xmm0, xmm3
- packssdw xmm0, xmm1, xmm1
- psraw xmm0, 6
- packuswb xmm0, xm0
- pextrw [dstq+dsq*0], xmm0, 0
- pextrw [dstq+dsq*1], xmm0, 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm1, xmm2, xmm3 ; 01 12
+ punpckhbw xmm2, xmm3 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm3, xmm1, xm7 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm8 ; a1 b1
+ paddw xmm3, xmm2
+ vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 5 6
+ punpcklbw xmm2, xmm4 ; 45 56
+ pmaddubsw xmm4, xmm2, xm9 ; a2 b2
+ paddw xmm3, xmm4
+ pmulhrsw xmm3, xm6
+ packuswb xmm3, xmm3
+ pextrw [dstq+dsq*0], xmm3, 0
+ pextrw [dstq+dsq*1], xmm3, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w2_loop
+ jg .v_w2_loop
RET
-.h_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
-.h_w4_loop:
- movq xmm0, [srcq+ssq*0]
- movq xmm1, [srcq+ssq*1]
+.v_w4:
+ movd xmm2, [srcq+nsq*2]
+ pinsrd xmm2, [srcq+nsq*1], 1
+ pinsrd xmm2, [srcq+ssq*0], 2
+ pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
- pshufb xmm0, xm6
- pshufb xmm1, xm6
- mova xmm2, xm5
- vpdpbusd xmm2, xmm0, xmm3
- mova xmm0, xm5
- vpdpbusd xmm0, xmm1, xmm3
- packssdw xmm0, xmm2, xmm0
- psraw xmm0, 6
- packuswb xmm0, xmm0
- movd [dstq+dsq*0], xmm0
- pextrd [dstq+dsq*1], xmm0, 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm1, xmm2, xmm3 ; 01 12
+ punpckhbw xmm2, xmm3 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm3, xmm1, xm7 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm8 ; a1 b1
+ paddw xmm3, xmm2
+ vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 5 6
+ punpcklbw xmm2, xmm4 ; 45 56
+ pmaddubsw xmm4, xmm2, xm9 ; a2 b2
+ paddw xmm3, xmm4
+ pmulhrsw xmm3, xm6
+ packuswb xmm3, xmm3
+ movd [dstq+dsq*0], xmm3
+ pextrd [dstq+dsq*1], xmm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w4_loop
+ jg .v_w4_loop
RET
-.h_w8:
- movu xm0, [srcq+ssq*0]
- vinserti32x4 ym0, [srcq+ssq*1], 1
+.v_w8:
+ movq xmm1, [srcq+nsq*2]
+ vpbroadcastq ymm3, [srcq+nsq*1]
+ vpbroadcastq ymm2, [srcq+ssq*0]
+ vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
- vpmovuswb xm0, ym0
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm1, ymm3, 0x30
+ vpblendd ymm3, ymm2, 0x30
+ punpcklbw ymm1, ymm3 ; 01 12
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm2, ymm4 ; 23 34
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm3, ymm1, ym7 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym8 ; a1 b1
+ paddw ymm3, ymm2
+ vpblendd ymm2, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm2, ymm4 ; 45 56
+ pmaddubsw ymm4, ymm2, ym9 ; a2 b2
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym6
+ vextracti128 xmm4, ymm3, 1
+ packuswb xmm3, xmm4
+ movq [dstq+dsq*0], xmm3
+ movhps [dstq+dsq*1], xmm3
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w8
+ jg .v_w8_loop
+ vzeroupper
RET
-.h_w16:
- mova m6, [spel_h_perm16a]
- mova m7, [spel_h_perm16b]
- mova m8, [spel_h_perm16c]
-.h_w16_loop:
- movu ym0, [srcq+ssq*0]
+.v_w16:
+ mova m5, [spel_v_perm16a]
+ vbroadcasti32x4 m1, [srcq+nsq*2]
+ vbroadcasti32x4 ym3, [srcq+nsq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m3, m2, 0xcc
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m1, m5, m1 ; 01 12
+ vpermb m2, m5, m2 ; 23 34
+.v_w16_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m3, m1, m7 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m8 ; a1 b1
+ paddw m3, m2
+ mova m2, m0
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m2, m5, m2 ; 45 56
+ pmaddubsw m4, m2, m9 ; a2 b2
+ paddw m3, m4
+ pmulhrsw m3, m6
+ vextracti32x8 ym4, m3, 1
+ packuswb ym3, ym4
+ mova [dstq+dsq*0], xm3
+ vextracti32x4 [dstq+dsq*1], ym3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m10, [spel_v_perm32]
+ pmovzxbq m5, [pb_02461357]
+ vpshrdw m11, m10, m10, 8
+ movu ym0, [srcq+nsq*2]
+ vinserti32x8 m0, [srcq+nsq*1], 1
+ vpermb m1, m10, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m2, m11, m0 ; 12
vinserti32x8 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 1, 2, 3, 1
- vpmovuswb ym0, m0
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], ym0, 1
+ vpermb m3, m10, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m4, m11, m0 ; 34
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m12, m1, m7
+ mova m1, m3
+ pmaddubsw m13, m2, m7
+ mova m2, m4
+ pmaddubsw m14, m3, m8
+ vpermb m3, m10, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m15, m4, m8
+ vpermb m4, m11, m0 ; 56
+ paddw m12, m14
+ pmaddubsw m14, m3, m9
+ paddw m13, m15
+ pmaddubsw m15, m4, m9
+ paddw m12, m14
+ paddw m13, m15
+ pmulhrsw m12, m6
+ pmulhrsw m13, m6
+ packuswb m12, m13
+ vpermq m12, m5, m12
+ mova [dstq+dsq*0], ym12
+ vextracti32x8 [dstq+dsq*1], m12, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w16_loop
+ jg .v_w32_loop
RET
-.h_w32:
- movu ym0, [srcq+ssq*0+8*0]
- vinserti32x8 m0, [srcq+ssq*1+8*0], 1
- movu ym1, [srcq+ssq*0+8*1]
- vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+.v_loop0:
+ movu m2, [srcq+nsq*2]
+ movu m4, [srcq+nsq*1]
+ lea r4, [srcq+ssq*2]
+ movu m11, [srcq+ssq*0]
+ movu m13, [srcq+ssq*1]
+ mov r7, dstq
+ movu m0, [r4 +ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m11 ; 12l
+ punpckhbw m4, m11 ; 12h
+ punpcklbw m10, m11, m13 ; 23l
+ punpckhbw m11, m13 ; 23h
+ punpcklbw m12, m13, m0 ; 34l
+ punpckhbw m13, m0 ; 34h
+.v_loop:
+ movu m5, [r4+ssq*1]
+ pmaddubsw m14, m1, m7 ; a0l
+ mova m1, m10
+ pmaddubsw m10, m8 ; a1l
+ lea r4, [r4+ssq*2]
+ pmaddubsw m15, m2, m7 ; a0h
+ mova m2, m11
+ pmaddubsw m11, m8 ; a1h
+ paddw m14, m10
+ punpcklbw m10, m0, m5 ; 45l
+ paddw m15, m11
+ punpckhbw m11, m0, m5 ; 45h
+ pmaddubsw m0, m10, m9 ; a2l
+ paddw m14, m0
+ pmaddubsw m0, m11, m9 ; a2h
+ paddw m15, m0
+ movu m0, [r4+ssq*0]
+ pmulhrsw m14, m6
+ pmulhrsw m15, m6
+ packuswb m14, m15
+ pmaddubsw m15, m3, m7 ; b0l
+ mova m3, m12
+ pmaddubsw m12, m8 ; b1l
+ mova [r7+dsq*0], m14
+ pmaddubsw m14, m4, m7 ; b0h
+ mova m4, m13
+ pmaddubsw m13, m8 ; b1h
+ paddw m15, m12
+ punpcklbw m12, m5, m0 ; 56l
+ paddw m14, m13
+ punpckhbw m13, m5, m0 ; 56h
+ pmaddubsw m5, m12, m9 ; b2l
+ paddw m15, m5
+ pmaddubsw m5, m13, m9 ; b2h
+ paddw m14, m5
+ pmulhrsw m15, m6
+ pmulhrsw m14, m6
+ packuswb m15, m14
+ mova [r7+dsq*1], m15
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add srcq, 64
+ add dstq, 64
+ movzx hd, r6b
+ sub r6d, 256
+ jg .v_loop0
+ RET
+.h:
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
+.hv:
+ vpbroadcastd m9, [pd_34]
+ mova xm10, [spel_hv_end]
+ pxor xm0, xm0
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq ym1, [base+subpel_filters+1+myq*8]
+ mov nsq, ssq
+ punpcklbw ym0, ym1
+ neg nsq
+ psraw ym0, 2 ; << 6
+ pshufd ym11, ym0, q0000
+ pshufd ym12, ym0, q1111
+ pshufd ym13, ym0, q2222
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym5, [subpel_h_shuf4]
+ movq xmm0, [srcq+nsq*2]
+ movhps xmm0, [srcq+nsq*1]
+ movq xmm2, [srcq+ssq*0]
+ movhps xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 4, 3, 2
- packuswb m0, m1
- mova [dstq+dsq*0], ym0
- vextracti32x8 [dstq+dsq*1], m0, 1
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm0, ymm1, 0x30
+ pshufb xmm2, xm5 ; 2 3
+ pshufb ymm0, ym5 ; 0 1 4
+ mova xmm1, xm9
+ vpdpbusd xmm1, xmm2, xm7
+ mova ymm2, ym9
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1
+ psraw ymm2, 2
+ vextracti128 xmm0, ymm2, 1
+ vzeroupper
+ palignr xmm0, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm0 ; 01 12
+ punpckhwd xmm2, xmm0 ; 23 34
+.hv_w2_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm3, [srcq+ssq*0]
+ pmaddwd xmm4, xmm1, xm11 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm4, xmm2, xm12 ; a1 b1
+ pshufb xmm3, xm5
+ mova xmm2, xm9
+ vpdpbusd xmm2, xmm3, xm7
+ packssdw xmm3, xmm2, xmm2
+ psraw xmm3, 2
+ palignr xmm2, xmm3, xmm0, 12
+ mova xmm0, xmm3
+ punpcklwd xmm2, xmm3 ; 45 56
+ vpdpwssd xmm4, xmm2, xm13 ; a2 b2
+ packuswb xmm4, xmm4
+ pshufb xmm4, xm10
+ pextrw [dstq+dsq*0], xmm4, 0
+ pextrw [dstq+dsq*1], xmm4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w32
+ jg .hv_w2_loop
RET
-.h_w64:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- add srcq, ssq
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 4, 3, 2
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .h_w64
+.hv_w4:
+ movq xm2, [srcq+nsq*2]
+ vpbroadcastq ym1, [srcq+nsq*1]
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m5, [subpel_h_shufA]
+ vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4
+ pshufb m1, m5
+ mova m0, m9
+ pshufb m2, m5
+ mova m3, m9
+ vpdpbusd m0, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ vpdpbusd m3, m2, m7
+ mova ym2, [spel_hv_perm4b]
+ mov r6d, 0x5555
+ mova ym6, [spel_hv_perm4d]
+ packssdw m0, m3
+ kmovw k1, r6d
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+.hv_w4_loop:
+ movq xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym3, [srcq+ssq*0], 1
+ pmaddwd ym4, ym1, ym11 ; a0 b0
+ mova ym1, ym2
+ pshufb ym3, ym5
+ mova ym0, ym9
+ vpdpbusd ym0, ym3, ym7
+ vpdpwssd ym4, ym2, ym12 ; a1 b1
+ vpsraw ym2{k1}, ym0, 2 ; 5 6
+ vpermb ym2, ym6, ym2 ; 45 56
+ vpdpwssd ym4, ym2, ym13 ; a2 b2
+ packuswb ym4, ym4
+ vpermb ym4, ym10, ym4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
RET
-.h_w128:
- movu m0, [srcq+8*0]
- movu m2, [srcq+8*1]
- movu m1, [srcq+8*8]
- movu m3, [srcq+8*9]
- add srcq, ssq
- PUT_8TAP_H 0, 4, 11, 12
- PUT_8TAP_H 2, 12, 11, 4
- PUT_8TAP_H 1, 4, 11, 12
- PUT_8TAP_H 3, 12, 11, 4
- packuswb m0, m2
- packuswb m1, m3
- mova [dstq+64*0], m0
- mova [dstq+64*1], m1
- add dstq, dsq
- dec hd
- jg .h_w128
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m12, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+1+myq*8]
+ mov nsq, ssq
+ punpcklbw m0, m1
+ neg nsq
+ psraw m0, 2 ; << 6
+ pshufd m13, m0, q0000
+ pshufd m14, m0, q1111
+ pshufd m15, m0, q2222
+ cmp wd, 8
+ jne .hv_w16
+ movu xm0, [srcq+nsq*2]
+ vinserti32x4 ym0, [srcq+nsq*1], 1
+ vbroadcasti32x4 m1, [subpel_h_shufA]
+ vinserti32x4 m0, [srcq+ssq*0], 2
+ vbroadcasti32x4 m4, [subpel_h_shufB]
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m7, [subpel_h_shufC]
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ pshufb m1, m0, m1 ; 0 1 2 3 0123
+ mova m2, m9
+ vpdpbusd m2, m1, m11
+ pshufb m4, m0, m4 ; 0 1 2 3 4567
+ mova m1, m9
+ vpdpbusd m1, m4, m11
+ pshufb m0, m7 ; 0 1 2 3 89ab
+ pshufb ym7, ym5, ym6 ; 4 0123 4567
+ mova ym3, ym9
+ vpdpbusd ym3, ym7, ym11
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ vpdpbusd m2, m4, m12
+ mova m4, [spel_hv_perm8a]
+ pshufb ym5, ym7 ; 4 4567 89ab
+ vpdpbusd m1, m0, m12
+ vpaddd m0, m4, [pb_32] {1to16}
+ vpdpbusd ym3, ym5, ym12
+ mova m5, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ vpmovsdw xm3, ym3
+ kmovq k1, r6
+ psraw m2, 2 ; 0 1 2 3
+ psraw xm3, 2 ; 4
+ vpermb m1, m4, m2 ; 01 12
+ kshiftrq k2, k1, 16
+ vpermt2b m2, m0, m3 ; 23 34
+.hv_w8_loop:
+ vbroadcasti32x4 ym3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m3{k1}, [srcq+ssq*0]
+ pmaddwd m0, m1, m13 ; a0 b0
+ pshufb m1, m3, m6 ; 5 6 0123 4567
+ mova m4, m9
+ vpdpbusd m4, m1, m11
+ pshufb m3, m7 ; 5 6 4567 89ab
+ vpdpwssd m0, m2, m14 ; a1 b1
+ mova m1, m2
+ vpdpbusd m4, m3, m12
+ psraw m2{k2}, m4, 2 ; 53 64
+ vpermb m2, m5, m2 ; 45 56
+ vpdpwssd m0, m2, m15 ; a2 b2
+ packuswb m0, m0
+ vpermb m0, m10, m0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m19, [spel_hv_perm16a]
+ vpbroadcastd m7, [pb_4]
+ lea r6d, [wq*2-32]
+ mova m6, [spel_hv_perm16b]
+ paddb m20, m7, m19
+ lea r6d, [hq+r6*8]
+ paddb m21, m7, m20
+ mova ym10, [spel_hv_end16]
+ paddb m7, m6
+.hv_w16_loop0:
+ movu ym16, [srcq+nsq*2]
+ vinserti32x8 m16, [srcq+nsq*1], 1
+ lea r4, [srcq+ssq*2]
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1
+ mov r7, dstq
+ movu ym18, [r4 +ssq*0]
+ vpermb m2, m19, m16 ; 0 1 0123 89ab
+ mova m1, m9
+ vpermb m3, m21, m16 ; 0 1 89ab ghij
+ vpdpbusd m1, m2, m11
+ mova m2, m9
+ vpermb m4, m19, m17 ; 2 3 0123 89ab
+ vpdpbusd m2, m3, m12
+ mova m3, m9
+ vpermb m5, m21, m17 ; 2 3 89ab ghij
+ vpdpbusd m3, m4, m11
+ mova m4, m9
+ vpermb m0, m6, m18 ; 4 0145 2367 89cd abef
+ vpdpbusd m4, m5, m12
+ mova m5, m9
+ vpermb m16, m20, m16 ; 0 1 4567 cdef
+ vpdpbusd m5, m0, m11
+ vpermb m17, m20, m17 ; 2 3 4567 cdef
+ vpdpbusd m1, m16, m12
+ vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m17, m12
+ vpdpbusd m4, m17, m11
+ vpdpbusd m5, m18, m12
+ packssdw m1, m2 ; 01
+ packssdw m3, m4 ; 23
+ REPX {psraw x, 2}, m1, m3, m5
+ vpshrdd m2, m1, m3, 16 ; 12
+ vpshrdd m4, m3, m5, 16 ; 34
+.hv_w16_loop:
+ movu ym18, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti32x8 m18, [r4+ssq*0], 1
+ pmaddwd m16, m1, m13 ; a0
+ vpermb m1, m19, m18 ; 5 6 0123 89ab
+ pmaddwd m17, m2, m13 ; b0
+ vpermb m2, m20, m18 ; 5 6 4567 cdef
+ mova m0, m9
+ vpdpbusd m0, m1, m11
+ vpermb m18, m21, m18
+ mova m1, m9
+ vpdpbusd m1, m2, m11
+ vpdpwssd m16, m3, m14 ; a1
+ vpdpwssd m17, m4, m14 ; b1
+ vpdpbusd m0, m2, m12
+ mova m2, m4
+ vpdpbusd m1, m18, m12
+ packssdw m0, m1
+ mova m1, m3
+ psraw m4, m0, 2 ; 5 6
+ vpshrdd m3, m2, m4, 16 ; 4 5
+ vpdpwssd m17, m4, m15 ; b2
+ vpdpwssd m16, m3, m15 ; a2
+ packuswb m16, m17
+ vpermb m16, m10, m16
+ mova [r7+dsq*0], xm16
+ vextracti128 [r7+dsq*1], ym16, 1
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
RET
+
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp, SHARP, SHARP
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
tzcnt r6d, wd
+ lea myq, [base+subpel_filters+myq*8]
movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
vpbroadcastd m7, [pw_512]
- lea myq, [base+subpel_filters+myq*8]
vpbroadcastw m8, [myq+0]
- vpbroadcastw m9, [myq+2]
- vpbroadcastw m10, [myq+4]
- vpbroadcastw m11, [myq+6]
add r6, r8
+ vpbroadcastw m9, [myq+2]
lea ss3q, [ssq*3]
+ vpbroadcastw m10, [myq+4]
sub srcq, ss3q
+ vpbroadcastw m11, [myq+6]
jmp r6
.v_w2:
movd xmm2, [srcq+ssq*0]
@@ -1802,7 +2200,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vzeroupper
RET
.v_w16:
- mova m12, [spel_v_perm16]
+ mova m12, [spel_v_perm16a]
vbroadcasti32x4 m1, [srcq+ssq*0]
vbroadcasti32x4 ym4, [srcq+ssq*1]
mov r6d, 0x0f
@@ -1990,7 +2388,146 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .v_loop0
vzeroupper
RET
+.h: ; reached from the entry dispatch when (mxd & 0xf00) != 0
+ test myd, 0xf00 ; vertical subpel bits set as well?
+ jnz .hv ; yes -> combined horizontal+vertical path
+.h2: ; horizontal-only filtering
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jl .h_w2 ; w < 4
+ vbroadcasti128 m6, [subpel_h_shufA] ; vector load leaves the cmp flags intact
+ je .h_w4 ; w == 4, still using the flags from "cmp wd, 4"
+ tzcnt wd, wd ; jump-table index = trailing zero count of w
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16 ; keep the top ("8tap_h") field of mxd
+ sub srcq, 3 ; filter window starts 3 bytes left of src
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] ; 16-bit table entry
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0] ; bytes 0-3 of the 8-byte filter row
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4] ; bytes 4-7 of the same row
+ add wq, r8 ; table entries are offsets relative to r8
+ jmp wq
+.h_w2: ; w < 4: 4-coefficient filter, two rows per iteration
+ movzx mxd, mxb ; low byte of mxd = "4tap_h" field
+ dec srcq ; 4-tap window starts 1 byte left of src
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] ; bytes 2-5 of the filter row
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0] ; first row -> low qword
+ movhps xmm0, [srcq+ssq*1] ; second row -> high qword
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4 ; arrange source bytes per subpel_h_shuf4
+ mova xmm1, xm5 ; accumulator starts at the pd_34 bias (m5 set in .h2)
+ vpdpbusd xmm1, xmm0, xmm3 ; += u8 pixels * s8 coefficients, 4 products per dword
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0 ; saturate to unsigned bytes
+ pextrw [dstq+dsq*0], xmm0, 0 ; 2 bytes to the first dst row
+ pextrw [dstq+dsq*1], xmm0, 1 ; 2 bytes to the second dst row
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4: ; w == 4: 4-coefficient filter, two rows per iteration
+ movzx mxd, mxb ; low byte of mxd = "4tap_h" field
+ dec srcq ; 4-tap window starts 1 byte left of src
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] ; bytes 2-5 of the filter row
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0] ; first row
+ movq xmm1, [srcq+ssq*1] ; second row
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6 ; xm6 = subpel_h_shufA (loaded in .h2)
+ pshufb xmm1, xm6
+ mova xmm2, xm5 ; accumulators start at the pd_34 bias (m5 set in .h2)
+ vpdpbusd xmm2, xmm0, xmm3 ; first row sums
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3 ; second row sums
+ packssdw xmm0, xmm2, xmm0 ; first row in the low half, second row in the high half
+ psraw xmm0, 6
+ packuswb xmm0, xmm0 ; saturate to unsigned bytes
+ movd [dstq+dsq*0], xmm0 ; bytes 0-3: first row
+ pextrd [dstq+dsq*1], xmm0, 1 ; bytes 4-7: second row
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8: ; two 8-pixel rows per iteration
+ movu xm0, [srcq+ssq*0] ; first row in the low 128 bits
+ vinserti32x4 ym0, [srcq+ssq*1], 1 ; second row in the upper 128 bits
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0 ; words -> bytes, unsigned saturation; NOTE(review): assumes PUT_8TAP_H (not visible here) left filtered words in m0
+ movq [dstq+dsq*0], xm0 ; low 8 bytes: first row
+ movhps [dstq+dsq*1], xm0 ; high 8 bytes: second row
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16: ; replaces the m6-m8 shuffles loaded in .h2 with permute indices
+ mova m6, [spel_h_perm16]
+ vpbroadcastd m8, [pb_4]
+ paddb m7, m8, m6 ; m7 = m6 + pb_4 (per byte)
+ paddb m8, m7 ; m8 = m6 + 2*pb_4 (per byte)
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0] ; first row in the low 256 bits
+ vinserti32x8 m0, [srcq+ssq*1], 1 ; second row in the high 256 bits
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0 ; words -> bytes, unsigned saturation
+ mova [dstq+dsq*0], xm0 ; 16 bytes: first row
+ vextracti128 [dstq+dsq*1], ym0, 1 ; 16 bytes: second row
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32: ; two 32-pixel rows per iteration
+ movu ym0, [srcq+ssq*0+8*0] ; first row, byte offset 0
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1 ; second row, byte offset 0
+ movu ym1, [srcq+ssq*0+8*1] ; first row, byte offset 8
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1 ; second row, byte offset 8
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1 ; merge both word vectors into bytes (unsigned saturation)
+ mova [dstq+dsq*0], ym0 ; low 256 bits: first row
+ vextracti32x8 [dstq+dsq*1], m0, 1 ; high 256 bits: second row
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64: ; one 64-pixel row per iteration
+ movu m0, [srcq+8*0] ; source bytes at offset 0
+ movu m1, [srcq+8*1] ; source bytes at offset 8
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1 ; merge both word vectors into 64 output bytes
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128: ; one 128-pixel row per iteration, as two 64-byte halves
+ movu m0, [srcq+8*0] ; first half, offset 0
+ movu m2, [srcq+8*1] ; first half, offset 8
+ movu m1, [srcq+8*8] ; second half, offset 64
+ movu m3, [srcq+8*9] ; second half, offset 72
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2 ; first 64 output bytes
+ packuswb m1, m3 ; second 64 output bytes
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
.hv:
+ vpbroadcastd m9, [pd_34]
+ pxor xm0, xm0
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -2000,12 +2537,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- vpbroadcastd m8, [pd_2]
- vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ vpbroadcastq ym1, [base+subpel_filters+myq*8]
lea ss3q, [ssq*3]
- vpbroadcastd ym9, [pd_32768]
mov r6, srcq
- punpcklbw ym0, ym8, ym0
+ punpcklbw ym0, ym1
sub r6, ss3q
psraw ym0, 2 ; << 6
mova xm14, [spel_hv_end]
@@ -2029,9 +2564,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
pshufb ymm2, ym6
pshufb ymm0, ym6
- mova ymm1, ym8
+ mova ymm1, ym9
vpdpbusd ymm1, ymm2, ym7
- mova ymm2, ym8
+ mova ymm2, ym9
vpdpbusd ymm2, ymm0, ym7
packssdw ymm2, ymm1, ymm2
psraw ymm2, 2
@@ -2045,14 +2580,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm4, [srcq+ssq*0]
- mova xmm5, xm9
- vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ pmaddwd xmm5, xmm1, xm10 ; a0 b0
mova xmm1, xmm2
vpdpwssd xmm5, xmm2, xm11 ; a1 b1
pshufb xmm4, xm6
mova xmm2, xmm3
vpdpwssd xmm5, xmm3, xm12 ; a2 b2
- mova xmm3, xm8
+ mova xmm3, xm9
vpdpbusd xmm3, xmm4, xm7
packssdw xmm4, xmm3, xmm3
psraw xmm4, 2
@@ -2081,9 +2615,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
pshufb m2, m6
pshufb m1, m6
- mova m0, m8
+ mova m0, m9
vpdpbusd m0, m2, m7
- mova m4, m8
+ mova m4, m9
vpdpbusd m4, m1, m7
mova ym1, [spel_hv_perm4a]
mova ym2, [spel_hv_perm4b]
@@ -2100,11 +2634,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
- mova ym5, ym9
- vpdpwssd ym5, ym1, ym10 ; a0 b0
+ pmaddwd ym5, ym1, ym10 ; a0 b0
mova ym1, ym2
pshufb ym4, ym6
- mova ym0, ym8
+ mova ym0, ym9
vpdpbusd ym0, ym4, ym7
vpdpwssd ym5, ym2, ym11 ; a1 b1
mova ym2, ym3
@@ -2129,10 +2662,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- vpbroadcastd m8, [pd_2]
- vpbroadcastq m0, [base+subpel_filters+myq*8]
- vpbroadcastd m9, [pd_32768]
- punpcklbw m0, m8, m0
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ punpcklbw m0, m1
lea ss3q, [ssq*3]
psraw m0, 2 ; << 6
pshufd m12, m0, q0000
@@ -2153,31 +2684,31 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti32x4 m4, [subpel_h_shufA]
vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
vbroadcasti32x4 m7, [subpel_h_shufB]
- vbroadcasti32x4 m17, [subpel_h_shufC]
+ vbroadcasti32x4 m8, [subpel_h_shufC]
pshufb m1, m6, m4 ; 0 1 2 3 0123
- mova m2, m8
+ mova m2, m9
vpdpbusd m2, m1, m10
pshufb m5, m6, m7 ; 0 1 2 3 4567
- mova m1, m8
+ mova m1, m9
vpdpbusd m1, m5, m10
pshufb m4, m0, m4 ; 4 5 6 _ 0123
- mova m3, m8
+ mova m3, m9
vpdpbusd m3, m4, m10
pshufb m7, m0, m7 ; 4 5 6 _ 4567
- mova m4, m8
+ mova m4, m9
vpdpbusd m4, m7, m10
- pshufb m6, m17
+ pshufb m6, m8
vpdpbusd m2, m5, m11
vpdpbusd m1, m6, m11
- pshufb m6, m0, m17
+ pshufb m6, m0, m8
vpdpbusd m3, m7, m11
vpdpbusd m4, m6, m11
mova m5, [spel_hv_perm8a]
- mova m0, [spel_hv_perm8b]
+ vpaddd m0, m5, [pb_32] {1to16}
mov r6, 0x55555555ff00
packssdw m2, m1
packssdw m3, m4
- mova m18, [spel_hv_perm8c]
+ mova m8, [spel_hv_perm8b]
psraw m2, 2 ; 0 1 2 3
psraw m3, 2 ; 4 5 6 _
vpermb m1, m5, m2 ; 01 12
@@ -2192,10 +2723,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti32x4 ym4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
- mova m0, m9
- vpdpwssd m0, m1, m12 ; a0 b0
+ pmaddwd m0, m1, m12 ; a0 b0
pshufb m1, m4, m6 ; 7 8 0123 4567
- mova m5, m8
+ mova m5, m9
vpdpbusd m5, m1, m10
pshufb m4, m7 ; 7 8 4567 89ab
vpdpwssd m0, m2, m13 ; a1 b1
@@ -2204,7 +2734,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
mova m2, m3
vpdpwssd m0, m3, m14 ; a2 b2
psraw m3{k2}, m5, 2 ; 75 86
- vpermb m3, m18, m3 ; 67 78
+ vpermb m3, m8, m3 ; 67 78
vpdpwssd m0, m3, m15 ; a3 b3
packuswb m0, m0
vpermb zmm1, m16, m0
@@ -2216,111 +2746,652 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vzeroupper
RET
.hv_w16:
- movu m7, [spel_hv_perm16a]
+ WIN64_SPILL_XMM 23
+ movu m22, [spel_hv_perm16a]
sub srcq, ss3q
- mova m20, [spel_hv_perm16b]
+ vpbroadcastd m8, [pb_4]
lea r6d, [wq*2-32]
- mova m21, [spel_hv_perm16c]
- mov r4, srcq
- mov r7, dstq
+ mova m7, [spel_hv_perm16b]
+ paddb m20, m8, m22
mova ym16, [spel_hv_end16]
+ paddb m21, m8, m20
lea r6d, [hq+r6*8]
+ paddb m8, m7
.hv_w16_loop0:
movu ym17, [srcq+ssq*0]
vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ lea r4, [srcq+ss3q]
movu ym18, [srcq+ssq*2]
- add srcq, ss3q
- vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
- movu ym19, [srcq+ssq*1]
- vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
- add srcq, ss3q
- vpermb m2, m7, m17 ; 0 1 0123 89ab
- vpermb m0, m20, m17 ; 0 1 4567 cdef
- vpermb m4, m7, m18 ; 2 3 0123 89ab
- mova m1, m8
+ vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3
+ mov r7, dstq
+ movu ym19, [r4 +ssq*1]
+ vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5
+ add r4, ss3q
+ vpermb m2, m22, m17 ; 0 1 0123 89ab
+ mova m1, m9
+ vpermb m3, m21, m17 ; 0 1 89ab ghij
vpdpbusd m1, m2, m10
- vpermb m5, m20, m18 ; 2 3 4567 cdef
- mova m2, m8
- vpdpbusd m2, m0, m10
- vpermb m17, m21, m17 ; 0 1 89ab ghij
- mova m3, m8
+ mova m2, m9
+ vpermb m4, m22, m18 ; 2 3 0123 89ab
+ vpdpbusd m2, m3, m11
+ mova m3, m9
+ vpermb m5, m21, m18 ; 2 3 89ab ghij
vpdpbusd m3, m4, m10
- vpermb m6, m7, m19 ; 4 5 0123 89ab
- mova m4, m8
- vpdpbusd m4, m5, m10
- vpermb m18, m21, m18 ; 2 3 89ab ghij
- vpdpbusd m1, m0, m11
- movu ym0, [srcq+ssq*0] ; 6
- vpdpbusd m2, m17, m11
- vpermb m17, m20, m19 ; 4 5 4567 cdef
- vpdpbusd m3, m5, m11
- mova m5, m8
+ mova m4, m9
+ vpermb m6, m22, m19 ; 4 5 0123 89ab
+ vpdpbusd m4, m5, m11
+ mova m5, m9
+ vpermb m17, m20, m17 ; 0 1 4567 cdef
vpdpbusd m5, m6, m10
- mova m6, m8
- vpdpbusd m6, m17, m10
- vpdpbusd m4, m18, m11
- mova m18, [spel_hv_perm16d]
- vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
- vpdpbusd m5, m17, m11
- vpermb m19, m21, m19 ; 4 5 89ab ghij
- mova m17, m8
- vpdpbusd m17, m18, m10
- mova m18, [spel_hv_perm16e]
- vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
- packssdw m1, m2 ; 01
- vpdpbusd m6, m19, m11
- packssdw m3, m4 ; 23
- vpdpbusd m17, m0, m11
- psraw m1, 2
- packssdw m5, m6 ; 45
- psraw m3, 2
+ mova m6, m9
+ vpermb m0, m21, m19 ; 4 5 89ab ghij
+ vpdpbusd m1, m17, m11
+ vpdpbusd m2, m17, m10
+ movu ym17, [r4+ssq*0] ; 6
+ vpermb m18, m20, m18 ; 2 3 4567 cdef
+ vpdpbusd m6, m0, m11
+ vpermb m0, m7, m17 ; 6 0145 2367 89cd abef
+ vpdpbusd m3, m18, m11
+ vpermb m19, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m4, m18, m10
+ mova m18, m9
+ vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij
+ vpdpbusd m18, m0, m10
+ packssdw m1, m2
+ vpdpbusd m5, m19, m11
+ vpdpbusd m6, m19, m10
+ packssdw m3, m4
+ vpdpbusd m18, m17, m11
+ psraw m1, 2 ; 01
+ psraw m3, 2 ; 23
+ packssdw m5, m6
vpshrdd m2, m1, m3, 16 ; 12
- psraw m5, 2
+ psraw m5, 2 ; 45
vpshrdd m4, m3, m5, 16 ; 34
- psraw m17, 2
- vpshrdd m6, m5, m17, 16 ; 56
+ psraw m18, 2
+ vpshrdd m6, m5, m18, 16 ; 56
.hv_w16_loop:
- movu ym18, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti32x8 m18, [srcq+ssq*0], 1
+ movu ym19, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti32x8 m19, [r4+ssq*0], 1
+ pmaddwd m17, m1, m12 ; a0
+ vpermb m1, m22, m19 ; 7 8 0123 89ab
+ pmaddwd m18, m2, m12 ; b0
mova m0, m9
- vpdpwssd m0, m1, m12 ; a0
- vpermb m1, m7, m18 ; 7 8 0123 89ab
- mova m17, m9
- vpdpwssd m17, m2, m12 ; b0
- vpermb m2, m20, m18 ; 7 8 4567 cdef
- mova m19, m8
- vpdpbusd m19, m1, m10
- vpermb m18, m21, m18
- mova m1, m8
- vpdpbusd m1, m2, m10
- vpdpwssd m0, m3, m13 ; a1
- vpdpwssd m17, m4, m13 ; b1
- vpdpbusd m19, m2, m11
+ vpermb m2, m21, m19 ; 7 8 89ab ghij
+ vpdpbusd m0, m1, m10
+ mova m1, m9
+ vpermb m19, m20, m19 ; 7 8 4567 cdef
+ vpdpbusd m1, m2, m11
mova m2, m4
- vpdpbusd m1, m18, m11
+ vpdpwssd m17, m3, m13 ; a1
+ vpdpwssd m18, m4, m13 ; b1
mova m4, m6
- vpdpwssd m0, m5, m14 ; a2
- vpdpwssd m17, m6, m14 ; b2
- packssdw m19, m1
+ vpdpbusd m0, m19, m11
+ vpdpbusd m1, m19, m10
+ vpdpwssd m17, m5, m14 ; a2
+ vpdpwssd m18, m6, m14 ; b2
+ packssdw m0, m1
mova m1, m3
+ psraw m6, m0, 2 ; 78
mova m3, m5
- psraw m6, m19, 2 ; 7 8
- vpshrdd m5, m4, m6, 16 ; 6 7
- vpdpwssd m17, m6, m15 ; b3
- vpdpwssd m0, m5, m15 ; a3
- packuswb m0, m17
- vpermb zmm1, m16, m0
- mova [dstq+dsq*0], xmm1
- vextracti128 [dstq+dsq*1], ymm1, 1
- lea dstq, [dstq+dsq*2]
+ vpshrdd m5, m4, m6, 16 ; 67
+ vpdpwssd m18, m6, m15 ; b3
+ vpdpwssd m17, m5, m15 ; a3
+ packuswb m17, m18
+ vpermb m17, m16, m17
+ mova [r7+dsq*0], xm17
+ vextracti128 [r7+dsq*1], ym17, 1
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .hv_w16_loop
- add r4, 16
- add r7, 16
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101 ; replicate mx into three byte fields
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101 ; replicate my into three byte fields
+ add myd, t1d ; 6tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl] ; base pointer for table and constant addressing
+ movifnidn hd, hm
+ test mxd, 0xf00 ; middle ("mx") field nonzero?
+ jnz .h ; yes -> horizontal (or h+v) filtering
+ test myd, 0xf00 ; middle ("my") field nonzero?
+ jnz .v ; yes -> vertical-only filtering
+.prep: ; neither field set: dispatch through the plain prep table
+ tzcnt wd, wd ; jump-table index = trailing zero count of w
+ movzx wd, word [r7+wq*2+table_offset(prep,)] ; 16-bit table entry
+ add wq, r7 ; table entries are offsets relative to r7
+ lea r6, [ssq*3] ; r6 = 3 * stride; NOTE(review): presumably expected by the jump target - confirm
+%if WIN64
+ pop r7 ; NOTE(review): presumably undoes a prologue push of r7 on WIN64 - confirm against x86inc
+%endif
+ jmp wq
+.v: ; vertical-only 6-tap: pick coefficients, then dispatch on width
+ movzx mxd, myb ; mxd (scratch) = low ("4tap_v") field of myd
+ shr myd, 16 ; myd = top ("6tap_v") field
+ cmp hd, 4
+ cmove myd, mxd ; h == 4: use the low-field index instead
+ tzcnt r5d, wd ; jump-table index = trailing zero count of w
+ lea myq, [base+subpel_filters+1+myq*8] ; +1: start at byte 1 of the 8-byte filter row
+ movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] ; 16-bit table entry
+ vpbroadcastd m7, [pw_8192] ; pmulhrsw by 8192 == (x + 2) >> 2
+ sub srcq, ssq ; first of two one-row steps back (src -= 2 rows in total)
+ vpbroadcastw m8, [myq+0] ; coefficient pair: filter bytes 1-2
+ add r5, r7 ; table entries are offsets relative to r7
+ vpbroadcastw m9, [myq+2] ; coefficient pair: filter bytes 3-4
+ lea ss3q, [ssq*3]
+ vpbroadcastw m10, [myq+4] ; coefficient pair: filter bytes 5-6
+ sub srcq, ssq ; second one-row step back
+ jmp r5
+.v_w4: ; four 4-pixel output rows per iteration (row N = N rows below srcq at entry)
+ movd xmm2, [srcq+ssq*0] ; row 0
+ pinsrd xmm2, [srcq+ssq*1], 1 ; row 1
+ vpbroadcastd ymm1, [srcq+ssq*2] ; row 2
+ add srcq, ss3q ; srcq -> row 3
+ vpbroadcastd ymm3, [srcq+ssq*0] ; row 3
+ vpbroadcastd ymm0, [srcq+ssq*1] ; row 4 (kept in ymm0 for the loop)
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm1, ymm2, 0xeb
+ punpcklqdq ymm3, ymm0
+ vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _
+ pshufb ymm1, ymm5 ; 01 12 23 34
+.v_w4_loop:
+ pinsrd xmm0, [srcq+ssq*2], 1 ; row 5 next to the carried row 4
+ vpbroadcastd ymm2, [srcq+ss3q ] ; row 6
+ lea srcq, [srcq+ssq*4] ; srcq -> row 7
+ vpbroadcastd ymm3, [srcq+ssq*0] ; row 7
+ vpblendd ymm2, ymm0, 0xeb
+ vpbroadcastd ymm0, [srcq+ssq*1] ; row 8 (carried into the next iteration)
+ punpcklqdq ymm3, ymm0
+ vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _
+ pshufb ymm2, ymm5 ; 45 56 67 78
+ pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0
+ vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56
+ pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2
+ pmaddubsw ymm1, ym9 ; a1 b1 c1 d1
+ paddw ymm3, ymm4
+ paddw ymm3, ymm1 ; sum of the three coefficient-pair products
+ pmulhrsw ymm3, ym7 ; ym7 = pw_8192: (x + 2) >> 2
+ mova ymm1, ymm2 ; newest row pairs become the oldest for the next iteration
+ mova [tmpq], ymm3 ; 4 rows * 4 words = 32 bytes
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8: ; four 8-pixel output rows per iteration (row N = N rows below srcq at entry)
+ mova m6, [spel_v_perm8]
+ movq xm1, [srcq+ssq*0] ; row 0
+ mov r6d, 0x3e
+ movq xm2, [srcq+ssq*1] ; row 1
+ kmovb k1, r6d ; k1 = 0x3e (qword lane mask)
+ vpbroadcastq ym3, [srcq+ssq*2] ; row 2
+ add srcq, ss3q ; srcq -> row 3
+ vpunpcklqdq ym2, [srcq+ssq*0] {1to4} ; row 3 broadcast from memory
+ vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} ; row 4 broadcast from memory
+ movq xm0, [srcq+ssq*1] ; row 4 (carried into the loop)
+ kshiftlb k2, k1, 2 ; k2 = 0xf8
+ shufpd m1, m2, 0x18 ; 0 1 2 3 4
+ vpermb m1, m6, m1 ; 01 12 23 34
+.v_w8_loop:
+ vpbroadcastq ym3, [srcq+ss3q ] ; row 6
+ vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} ; row 5 broadcast from memory
+ lea srcq, [srcq+ssq*4] ; srcq -> row 7
+ vpbroadcastq m3, [srcq+ssq*1] ; row 8
+ vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} ; row 7 broadcast from memory
+ pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
+ vpermb m2, m6, m0 ; 45 56 67 78
+ mova xm0, xm3 ; carry row 8 into the next iteration
+ vshufi32x4 m1, m2, q1032 ; 23 34 45 56
+ pmaddubsw m3, m2, m10 ; a2 b2 c2 d2
+ pmaddubsw m5, m1, m9 ; a1 b1 c1 d1
+ mova m1, m2 ; newest row pairs become the oldest for the next iteration
+ paddw m4, m3
+ paddw m4, m5 ; sum of the three coefficient-pair products
+ pmulhrsw m4, m7 ; m7 = pw_8192: (x + 2) >> 2
+ mova [tmpq], m4 ; 4 rows * 8 words = 64 bytes
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16: ; four 16-pixel output rows per iteration (row N = N rows below srcq at entry)
+ mova m11, [spel_v_perm16b]
+ vbroadcasti32x4 m1, [srcq+ssq*0] ; row 0
+ mov r6d, 0x0f
+ vbroadcasti32x4 ym3, [srcq+ssq*1] ; row 1
+ vbroadcasti32x4 m2, [srcq+ssq*2] ; row 2
+ kmovb k1, r6d ; k1 = 0x0f: the masked vshufpd below writes only the low four qwords
+ add srcq, ss3q ; srcq -> row 3
+ vbroadcasti32x4 ym4, [srcq+ssq*0] ; row 3
+ vbroadcasti32x4 m0, [srcq+ssq*1] ; row 4 (carried into the loop)
+ vshufpd m1{k1}, m3, m2, 0xcc
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m1, m11, m1 ; 01 12
+ vpermb m2, m11, m2 ; 23 34
+.v_w16_loop:
+ pmaddubsw m3, m1, m8 ; a0 b0
+ pmaddubsw m5, m2, m9 ; a1 b1
+ vbroadcasti32x4 ym6, [srcq+ssq*2] ; row 5
+ pmaddubsw m4, m2, m8 ; c0 d0
+ vbroadcasti32x4 m2, [srcq+ss3q ] ; row 6
+ lea srcq, [srcq+ssq*4] ; srcq -> row 7
+ vshufpd m0{k1}, m6, m2, 0xcc
+ vbroadcasti32x4 ym6, [srcq+ssq*0] ; row 7
+ vpermb m1, m11, m0 ; 45 56
+ vbroadcasti32x4 m0, [srcq+ssq*1] ; row 8 (carried into the next iteration)
+ vshufpd m2{k1}, m6, m0, 0xcc
+ pmaddubsw m6, m1, m9 ; c1 d1
+ vpermb m2, m11, m2 ; 67 78
+ paddw m3, m5
+ pmaddubsw m5, m1, m10 ; a2 b2
+ paddw m4, m6
+ pmaddubsw m6, m2, m10 ; c2 d2
+ paddw m3, m5 ; rows a, b complete
+ paddw m4, m6 ; rows c, d complete
+ pmulhrsw m3, m7 ; m7 = pw_8192: (x + 2) >> 2
+ pmulhrsw m4, m7
+ mova [tmpq+ 0], m3
+ mova [tmpq+64], m4
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32: ; two 32-pixel output rows per iteration (row N = N rows below srcq at entry)
+ movshdup m6, [bilin_v_perm64]
+ movu ym16, [srcq+ssq*0] ; row 0
+ movu ym17, [srcq+ssq*1] ; row 1
+ movu ym18, [srcq+ssq*2] ; row 2
+ add srcq, ss3q ; srcq -> row 3
+ movu ym19, [srcq+ssq*0] ; row 3
+ add srcq, ssq ; srcq -> row 4
+ movu ym20, [srcq+ssq*0] ; row 4
+ vpermt2q m16, m6, m18 ; 0 2
+ vpermt2q m17, m6, m19 ; 1 3
+ vpermt2q m18, m6, m20 ; 2 4
+ punpcklbw m0, m16, m17 ; 01
+ punpcklbw m1, m17, m18 ; 12
+ punpckhbw m2, m16, m17 ; 23
+ punpckhbw m3, m17, m18 ; 34
+.v_w32_loop:
+ movu ym16, [srcq+ssq*1] ; row 5
+ lea srcq, [srcq+ssq*2] ; srcq -> row 6
+ movu ym17, [srcq+ssq*0] ; row 6
+ pmaddubsw m4, m0, m8 ; a0
+ mova m0, m2 ; 23 becomes the oldest pair for the next iteration
+ pmaddubsw m2, m9 ; a1
+ vpermt2q m16, m6, m17 ; 5 6
+ pmaddubsw m5, m1, m8 ; b0
+ mova m1, m3 ; 34 becomes the oldest pair for the next iteration
+ pmaddubsw m3, m9 ; b1
+ shufpd m18, m16, 0x55 ; 4 5
+ paddw m4, m2
+ punpcklbw m2, m18, m16 ; 45
+ paddw m5, m3
+ punpckhbw m3, m18, m16 ; 56
+ mova m18, m16 ; keep "5 6" for the next iteration's shufpd
+ pmaddubsw m16, m2, m10 ; a2
+ pmaddubsw m17, m3, m10 ; b2
+ paddw m4, m16
+ paddw m5, m17
+ pmulhrsw m4, m7 ; m7 = pw_8192: (x + 2) >> 2
+ pmulhrsw m5, m7
+ mova [tmpq+ 0], m4
+ mova [tmpq+64], m5
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64: ; w == 64 and w == 128 share this path: 64-pixel columns, two rows per iteration
+.v_w128:
+ mova m6, [bilin_v_perm64]
+ add wd, wd ; wq = 2*w = tmp row stride in bytes
+ lea r6d, [hq+wq] ; low byte: h for the per-column reload; higher bits: column countdown
+.v_loop0:
+ vpermq m12, m6, [srcq+ssq*0] ; row 0
+ vpermq m13, m6, [srcq+ssq*1] ; row 1
+ lea r5, [srcq+ssq*2]
+ vpermq m14, m6, [r5 +ssq*0] ; row 2
+ vpermq m15, m6, [r5 +ssq*1] ; row 3
+ lea r5, [r5+ssq*2]
+ vpermq m16, m6, [r5 +ssq*0] ; row 4
+ mov r7, tmpq ; r7 = output pointer for this column
+ punpcklbw m0, m12, m13 ; 01
+ punpckhbw m12, m13
+ punpcklbw m1, m13, m14 ; 12
+ punpckhbw m13, m14
+ punpcklbw m2, m14, m15 ; 23
+ punpckhbw m14, m15
+ punpcklbw m3, m15, m16 ; 34
+ punpckhbw m15, m16
+.v_loop:
+ pmaddubsw m17, m0, m8 ; a0
+ vpermq m5, m6, [r5+ssq*1] ; row 5
+ pmaddubsw m18, m12, m8
+ mova m0, m2
+ pmaddubsw m2, m9 ; a1
+ mova m12, m14
+ pmaddubsw m14, m9
+ lea r5, [r5+ssq*2]
+ pmaddubsw m19, m1, m8 ; b0
+ pmaddubsw m20, m13, m8
+ mova m1, m3
+ pmaddubsw m3, m9 ; b1
+ mova m13, m15
+ pmaddubsw m15, m9
+ paddw m17, m2
+ punpcklbw m2, m16, m5 ; 45
+ paddw m18, m14
+ punpckhbw m14, m16, m5
+ vpermq m16, m6, [r5+ssq*0] ; row 6
+ paddw m19, m3
+ pmaddubsw m3, m2, m10 ; a2
+ paddw m20, m15
+ pmaddubsw m15, m14, m10
+ paddw m17, m3
+ punpcklbw m3, m5, m16 ; 56
+ pmaddubsw m4, m3, m10 ; b2
+ paddw m18, m15
+ punpckhbw m15, m5, m16
+ pmaddubsw m5, m15, m10
+ paddw m19, m4
+ paddw m20, m5
+ REPX {pmulhrsw x, m7}, m17, m18, m19, m20
+ mova [r7+wq*0+ 0], m17
+ mova [r7+wq*0+64], m18
+ mova [r7+wq*1+ 0], m19
+ mova [r7+wq*1+64], m20
+ lea r7, [r7+wq*2]
+ sub hd, 2
+ jg .v_loop
+ add srcq, 64 ; next 64-pixel column
+ add tmpq, 128 ; 64 words = 128 bytes
+ movzx hd, r6b ; reload h from the low byte of the combined counter
+ sub r6d, 1<<8 ; one column done
+ jg .v_loop0
+ vzeroupper
+ RET
+.h: ; reached from the entry dispatch when (mxd & 0xf00) != 0
+ test myd, 0xf00 ; vertical subpel bits set as well?
+ jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 ; no: horizontal-only work is done by prep_8tap's .h2 path
+.hv: ; combined horizontal + vertical filtering
+ vpbroadcastd m8, [pd_2] ; initial accumulator for the horizontal vpdpbusd sums (>> 2 afterwards)
+ vpbroadcastd m9, [pd_32] ; initial accumulator for the vertical vpdpwssd sums (>> 6 afterwards)
+ cmp wd, 4
+ jg .hv_w8 ; w > 4
+ movzx mxd, mxb ; low byte of mxd = "4tap_h" field
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+2] ; horizontal coefficients: filter bytes 2-5
+ movzx mxd, myb ; mxd (scratch) = low ("4tap_v") field of myd
+ shr myd, 16 ; myd = top ("6tap_v") field
+ cmp hd, 4
+ cmove myd, mxd ; h == 4: use the low-field index instead
+ vpbroadcastq m3, [base+subpel_filters+1+myq*8] ; 8 bytes starting at byte 1 of the filter row
+ vbroadcasti128 m10, [subpel_h_shufA]
+ lea r6, [ssq*2+1]
+ mov r3d, 0x30
+ sub srcq, r6 ; src -= 2 rows + 1 byte
+ kmovb k1, r3d ; k1 = 0x30
+ vpbroadcastq ym2, [srcq+ssq*0]
+ lea ss3q, [ssq*3]
+ vpbroadcastq m1, [srcq+ssq*1]
+ kaddb k2, k1, k1 ; k2 = 0x60
+ vpbroadcastq m2{k1}, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3
+ punpcklbw m3, m3
+ vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4
+ psraw m3, 8 ; sign-extend
+ mova m6, [spel_hv_perm4a]
+ kshiftrb k1, k1, 2 ; k1 = 0x0c for the loop's masked broadcasts
+ movu m7, [spel_hv_perm4b]
+ pshufb m1, m10
+ mova m0, m8
+ vpdpbusd m0, m1, m11
+ pshufb m2, m10
+ mova m1, m8
+ vpdpbusd m1, m2, m11
+ pshufd m12, m3, q0000 ; vertical coefficient pair 0
+ pshufd m13, m3, q1111 ; vertical coefficient pair 1
+ pshufd m14, m3, q2222 ; vertical coefficient pair 2
+ packssdw m0, m1 ; _ _ _ 0 1 2 3 4
+ psraw m0, 2
+ vpermb m1, m7, m0 ; 01 12 23 34
+.hv_w4_loop:
+ movq xm3, [srcq+ssq*2]
+ movq xm4, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7
+ vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8
+ pshufb ym3, ym10
+ mova ym2, ym8
+ vpdpbusd ym2, ym3, ym11
+ pshufb ym4, ym10
+ mova ym3, ym8
+ vpdpbusd ym3, ym4, ym11
+ mova m4, m9
+ vpdpwssd m4, m1, m12 ; a0 b0 c0 d0
+ packssdw ym2, ym3 ; 5 6 7 8
+ psraw ym2, 2
+ vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8
+ vpermb m2, m6, m0 ; 23 34 45 56
+ vpermb m1, m7, m0 ; 45 56 67 78
+ vpdpwssd m4, m2, m13 ; a1 b1 c1 d1
+ vpdpwssd m4, m1, m14 ; a2 b2 c2 d2
+ psrad m4, 6
+ vpmovdw [tmpq], m4 ; truncate dwords to words: 16 words = 32 bytes
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8: ; w >= 8: shared setup, then w == 8 falls through and wider blocks go to .hv_w16
+ shr mxd, 16 ; keep the top ("6tap_h") field of mxd
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0] ; horizontal coefficients: filter bytes 0-3
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4] ; horizontal coefficients: filter bytes 4-7
+ movzx mxd, myb ; mxd (scratch) = low ("4tap_v") field of myd
+ shr myd, 16 ; myd = top ("6tap_v") field
+ cmp hd, 4
+ cmove myd, mxd ; h == 4: use the low-field index instead
+ vpbroadcastq m0, [base+subpel_filters+1+myq*8] ; 8 bytes starting at byte 1 of the filter row
+ lea r6, [ssq*2+3]
+ punpcklbw m0, m0
+ sub srcq, r6 ; src -= 2 rows + 3 bytes
+ psraw m0, 8 ; sign-extend
+ lea ss3q, [ssq*3]
+ pshufd m12, m0, q0000 ; vertical coefficient pair 0
+ pshufd m13, m0, q1111 ; vertical coefficient pair 1
+ pshufd m14, m0, q2222 ; vertical coefficient pair 2
+ cmp wd, 8
+ jg .hv_w16 ; w > 8
+ movu xm16, [srcq+ssq*0] ; row 0
+ vbroadcasti32x4 m19, [subpel_h_shufA]
+ vinserti128 ym16, [srcq+ssq*1], 1 ; row 1
+ vbroadcasti32x4 m21, [subpel_h_shufC]
+ vinserti32x4 m16, [srcq+ssq*2], 2 ; row 2
+ add srcq, ss3q ; srcq -> row 3
+ vinserti32x4 m16, [srcq+ssq*0], 3 ; row 3
+ movu xm17, [srcq+ssq*1] ; row 4
+ vbroadcasti32x4 m20, [subpel_h_shufB]
+ pshufb m3, m16, m19 ; 0 1 2 3 0123
+ mova m2, m8
+ pshufb m0, m16, m21 ; 0 1 2 3 89ab
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ pshufb xm1, xm17, xm19 ; 4 0123
+ vpdpbusd m3, m0, m11
+ mova xm0, xm8
+ pshufb xm18, xm17, xm21 ; 4 89ab
+ vpdpbusd xm0, xm1, xm10
+ mova xm1, xm8
+ pshufb m16, m20 ; 0 1 2 3 4567
+ vpdpbusd xm1, xm18, xm11
+ pshufb xm17, xm20 ; 4 4567
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m16, m10
+ vpdpbusd xm0, xm17, xm11
+ vpdpbusd xm1, xm17, xm10
+ packssdw m2, m3
+ packssdw xm0, xm1
+ psraw m2, 2 ; 0 1 2 3
+ psraw xm0, 2 ; 4
+ valignq m0, m2, 2 ; 1 2 3 4
+ punpcklwd m1, m2, m0 ; 01 12 23 34
+ punpckhwd m2, m0
+.hv_w8_loop:
+ movu xm16, [srcq+ssq*2] ; row 5
+ vinserti128 ym16, [srcq+ss3q ], 1 ; row 6
+ lea srcq, [srcq+ssq*4] ; srcq -> row 7
+ vinserti32x4 m16, [srcq+ssq*0], 2 ; row 7
+ vinserti32x4 m16, [srcq+ssq*1], 3 ; row 8
+ pshufb m6, m16, m19 ; 5 6 7 8 0123
+ mova m5, m8
+ pshufb m3, m16, m21 ; 5 6 7 8 89ab
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ pshufb m16, m20 ; 5 6 7 8 4567
+ vpdpbusd m6, m3, m11
+ mova m3, m9
+ vpdpwssd m3, m1, m12 ; a0 b0 c0 d0
+ mova m4, m9
+ vpdpwssd m4, m2, m12
+ vpdpbusd m5, m16, m11
+ vpdpbusd m6, m16, m10
+ mova m16, m1
+ packssdw m5, m6
+ mova m6, m2
+ psraw m5, 2 ; 5 6 7 8
+ valignq m2, m5, m0, 6 ; 4 5 6 7
+ mova m0, m5 ; carry rows 5-8 into the next iteration
+ punpcklwd m1, m2, m5 ; 45 56 67 78
+ punpckhwd m2, m5
+ vpdpwssd m3, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m4, m2, m14
+ vshufi32x4 m16, m1, q1032 ; 23 34 45 56
+ vshufi32x4 m6, m2, q1032
+ vpdpwssd m3, m16, m13 ; a1 b1 c1 d1
+ vpdpwssd m4, m6, m13
+ psrad m3, 6
+ psrad m4, 6
+ packssdw m3, m4
+ mova [tmpq], m3 ; 4 rows * 8 words = 64 bytes
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16: ; w >= 16: 16-pixel columns, two output rows per inner iteration
+ mova m16, [spel_h_perm16]
+ vpbroadcastd m18, [pb_4]
+ add wd, wd ; wq = 2*w = tmp row stride in bytes
+ paddb m17, m18, m16 ; m17 = m16 + pb_4 (per byte)
+ lea r6d, [hq+wq*8-256] ; = h + (w/16 - 1)*256: low byte h, higher bits = remaining columns
+ paddb m18, m17 ; m18 = m16 + 2*pb_4 (per byte)
+.hv_w16_loop0:
+ movu ym19, [srcq+ssq*0] ; row 0
+ vinserti32x8 m19, [srcq+ssq*1], 1 ; row 1
+ lea r5, [srcq+ssq*2]
+ movu ym20, [r5 +ssq*0] ; row 2
+ vinserti32x8 m20, [r5 +ssq*1], 1 ; row 3
+ lea r5, [r5 +ssq*2]
+ movu ym21, [r5 +ssq*0] ; row 4
+ mov r7, tmpq ; r7 = output pointer for this column
+ vpermb m3, m16, m19 ; 0 1 0123 89ab
+ mova m2, m8
+ vpermb m4, m18, m19 ; 0 1 89ab ghij
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ vpermb m5, m16, m20 ; 2 3 0123 89ab
+ vpdpbusd m3, m4, m11
+ mova m4, m8
+ vpermb m0, m18, m20 ; 2 3 89ab ghij
+ vpdpbusd m4, m5, m10
+ mova m5, m8
+ vpermb ym1, ym16, ym21 ; 4 0123 89ab
+ vpdpbusd m5, m0, m11
+ mova ym0, ym8
+ vpermb ym6, ym18, ym21 ; 4 89ab ghij
+ vpdpbusd ym0, ym1, ym10
+ mova ym1, ym8
+ vpermb m19, m17, m19 ; 0 1 4567 cdef
+ vpdpbusd ym1, ym6, ym11
+ vpermb m20, m17, m20 ; 2 3 4567 cdef
+ vpdpbusd m2, m19, m11
+ vpdpbusd m3, m19, m10
+ vpermb ym21, ym17, ym21 ; 4 4567 cdef
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m20, m10
+ vpdpbusd ym0, ym21, ym11
+ vpdpbusd ym1, ym21, ym10
+ packssdw m2, m3 ; 0 1
+ packssdw m4, m5 ; 2 3
+ packssdw ym0, ym1 ; 4
+ REPX {psraw x, 2}, m2, m4, ym0
+ vshufi32x4 m3, m2, m4, q1032 ; 1 2
+ vshufi32x4 m0, m4, m0, q1032 ; 3 4
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0 ; 23 34
+ punpckhwd m4, m0
+.hv_w16_loop:
+ movu ym19, [r5+ssq*1] ; row 5
+ lea r5, [r5+ssq*2]
+ vinserti32x8 m19, [r5+ssq*0], 1 ; row 6
+ vpermb m6, m16, m19 ; 5 6 0123 89ab
+ mova m5, m8
+ vpermb m20, m18, m19 ; 5 6 89ab ghij
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpermb m19, m17, m19 ; 5 6 4567 cdef
+ vpdpbusd m6, m20, m11
+ mova m20, m9
+ vpdpwssd m20, m1, m12 ; a0 b0
+ mova m21, m9
+ vpdpwssd m21, m2, m12
+ vpdpbusd m5, m19, m11
+ vpdpbusd m6, m19, m10
+ vpdpwssd m20, m3, m13 ; a1 b1
+ vpdpwssd m21, m4, m13
+ packssdw m5, m6
+ mova m1, m3 ; 23 34 become the oldest pairs for the next iteration
+ psraw m5, 2 ; 5 6
+ mova m2, m4
+ vshufi32x4 m4, m0, m5, q1032 ; 4 5
+ mova m0, m5 ; carry rows 5 6 into the next iteration
+ punpcklwd m3, m4, m0 ; 45 56
+ punpckhwd m4, m0
+ vpdpwssd m20, m3, m14 ; a2 b2
+ vpdpwssd m21, m4, m14
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [r7+wq*0], ym20 ; low 256 bits: first output row
+ vextracti32x8 [r7+wq*1], m20, 1 ; high 256 bits: second output row
+ lea r7, [r7+wq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add srcq, 16 ; next 16-pixel column
+ add tmpq, 32 ; 16 words = 32 bytes
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .hv_w16_loop0
vzeroupper
@@ -2353,183 +3424,38 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
mova [tmpq+64*1], m1
%endmacro
-%if WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-
-%define PREP_8TAP_FN FN prep_8tap,
-
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN regular, REGULAR, REGULAR
-cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
lea r7, [prep_avx512icl]
- movsxd wq, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
- jnz .v
- tzcnt wd, wd
- movzx wd, word [r7+wq*2+table_offset(prep,)]
- add wq, r7
- lea r6, [strideq*3]
-%if WIN64
- pop r7
-%endif
- jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m4, [pd_2]
- WIN64_SPILL_XMM 10
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
- vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
- add wq, r7
- jmp wq
-.h_w4:
- movzx mxd, mxb
- vbroadcasti128 ym5, [subpel_h_shufA]
- mov r3d, 0x4
- dec srcq
- vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
- kmovb k1, r3d
- lea stride3q, [strideq*3]
-.h_w4_loop:
- movq xm2, [srcq+strideq*0]
- movq xm3, [srcq+strideq*1]
- vpbroadcastq ym2{k1}, [srcq+strideq*2]
- vpbroadcastq ym3{k1}, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pshufb ym2, ym5
- pshufb ym3, ym5
- mova ym0, ym4
- vpdpbusd ym0, ym2, ym6
- mova ym1, ym4
- vpdpbusd ym1, ym3, ym6
- packssdw ym0, ym1
- psraw ym0, 2
- mova [tmpq], ym0
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
-.h_w8:
- vbroadcasti128 m5, [subpel_h_shufA]
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- lea stride3q, [strideq*3]
-.h_w8_loop:
- movu xmm3, [srcq+strideq*0]
- vinserti128 ym3, ymm3, [srcq+strideq*1], 1
- vinserti128 m3, [srcq+strideq*2], 2
- vinserti128 m3, [srcq+stride3q ], 3
- lea srcq, [srcq+strideq*4]
- pshufb m1, m3, m5
- pshufb m2, m3, m6
- mova m0, m4
- vpdpbusd m0, m1, m8
- mova m1, m4
- vpdpbusd m1, m2, m8
- pshufb m3, m7
- vpdpbusd m0, m2, m9
- vpdpbusd m1, m3, m9
- packssdw m0, m1
- psraw m0, 2
- mova [tmpq], m0
- add tmpq, 64
- sub hd, 4
- jg .h_w8_loop
- RET
-.h_w16:
- mova m5, [spel_h_perm16a]
- mova m6, [spel_h_perm16b]
- mova m7, [spel_h_perm16c]
- lea stride3q, [strideq*3]
-.h_w16_loop:
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*2]
- vinserti32x8 m0, [srcq+strideq*1], 1
- vinserti32x8 m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- PREP_8TAP_H
- add tmpq, 64*2
- sub hd, 4
- jg .h_w16_loop
- RET
-.h_w32:
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
-.h_w32_loop:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
- add tmpq, 64*2
- sub hd, 2
- jg .h_w32_loop
- RET
-.h_w64:
- xor r6d, r6d
- jmp .h_start
-.h_w128:
- mov r6, -64*1
-.h_start:
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
- sub srcq, r6
- mov r5, r6
-.h_loop:
- movu m0, [srcq+r6+32*0]
- movu m1, [srcq+r6+32*1]
- PREP_8TAP_H
- add tmpq, 64*2
- add r6, 64
- jle .h_loop
- add srcq, strideq
- mov r6, r5
- dec hd
- jg .h_loop
- RET
+ jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
.v:
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
- tzcnt wd, wd
cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
cmove myd, mxd ; had a negligible effect on performance.
- ; TODO: Would a 6-tap code path be worth it?
- lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
- add wq, r7
- lea stride3q, [strideq*3]
- sub srcq, stride3q
+ tzcnt r5d, wd
+ lea myq, [base+subpel_filters+myq*8]
+ movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
vpbroadcastd m7, [pw_8192]
vpbroadcastw m8, [myq+0]
+ add r5, r7
vpbroadcastw m9, [myq+2]
+ lea stride3q, [strideq*3]
vpbroadcastw m10, [myq+4]
+ sub srcq, stride3q
vpbroadcastw m11, [myq+6]
- jmp wq
+ jmp r5
.v_w4:
movd xmm0, [srcq+strideq*0]
vpbroadcastd ymm1, [srcq+strideq*2]
@@ -2576,172 +3502,146 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vzeroupper
RET
.v_w8:
- mov r3d, 0xf044
- kmovw k1, r3d
- kshiftrw k2, k1, 8
- movq xm0, [srcq+strideq*0]
- vpbroadcastq ym1, [srcq+strideq*1]
- vpbroadcastq m2, [srcq+strideq*2]
- vpbroadcastq m3, [srcq+stride3q ]
+ mova m6, [spel_v_perm8]
+ movq xm1, [srcq+strideq*0]
+ mov r6d, 0x3e
+ movq xm2, [srcq+strideq*1]
+ vpbroadcastq ym3, [srcq+strideq*2]
+ kmovb k1, r6d
+ vpbroadcastq ym4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vpbroadcastq m4, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- vpbroadcastq m6, [srcq+strideq*2]
- vmovdqa64 ym0{k1}, ym1
- vmovdqa64 ym1{k1}, ym2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- punpcklbw ym0, ym1 ; 01 12 __ __
- punpcklbw m2, m3 ; 23 34 23 34
- punpcklbw m4, m5 ; 45 56 45 56
- vmovdqa64 m0{k2}, m2 ; 01 12 23 34
- vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+ vpunpcklqdq m1{k1}, m3, [srcq+strideq*0] {1to8}
+ vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8}
+ movq xm0, [srcq+strideq*2]
+ kshiftlb k2, k1, 2
+ shufpd m1, m2, 0x30 ; 0 1 2 3 4 5
+ vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _
+ vpermb m1, m6, m1 ; 01 12 23 34
+ vpermb m2, m6, m2 ; 23 34 45 56
.v_w8_loop:
- vpbroadcastq m1, [srcq+stride3q ]
+ vpbroadcastq ym3, [srcq+strideq*4]
+ vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4}
lea srcq, [srcq+strideq*4]
- vpbroadcastq m3, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- pmaddubsw m14, m0, m8
- pmaddubsw m15, m2, m9
- vpblendmq m0{k1}, m6, m1
- vpblendmq m2{k1}, m1, m3
- vpbroadcastq m6, [srcq+strideq*2]
- paddw m14, m15
- punpcklbw m2, m0, m2 ; 67 78 67 78
- vpblendmq m12{k1}, m3, m5
- vpblendmq m13{k1}, m5, m6
- vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
- punpcklbw m4, m12, m13 ; 89 9a 89 9a
- vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
- pmaddubsw m12, m0, m10
- pmaddubsw m13, m2, m11
- paddw m14, m12
- paddw m14, m13
- pmulhrsw m14, m7
- mova [tmpq], m14
+ vpbroadcastq m3, [srcq+strideq*2]
+ vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8}
+ pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1 c1 d1
+ vpermb m2, m6, m0 ; 67 78 89 9a
+ mova xm0, xm3
+ vshufi32x4 m1, m2, q1032 ; 45 56 67 78
+ pmaddubsw m3, m2, m11 ; a3 b3 c3 d3
+ paddw m4, m5
+ pmaddubsw m5, m1, m10 ; a2 b2 c2 d2
+ paddw m4, m3
+ paddw m4, m5
+ pmulhrsw m4, m7
+ mova [tmpq], m4
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
- mov r3d, 0xf0
- kmovb k1, r3d
- vbroadcasti128 m0, [srcq+strideq*0]
- vbroadcasti128 m1, [srcq+strideq*1]
- vbroadcasti128 m2, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+stride3q ]
+ mova m12, [spel_v_perm16b]
+ vbroadcasti32x4 m1, [srcq+strideq*0]
+ mov r6d, 0x0f
+ vbroadcasti32x4 ym4, [srcq+strideq*1]
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ kmovb k1, r6d
+ vbroadcasti32x4 ym5, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vbroadcasti128 m4, [srcq+strideq*0]
- vbroadcasti128 m5, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- vmovdqa64 m0{k1}, m1
- vmovdqa64 m1{k1}, m2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
- shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
- shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
- shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
- punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
- punpcklbw m0, m1 ; 01a 01b 12a 12b
- punpcklbw m4, m5 ; 45a 45b 56a 56b
+ vbroadcasti32x4 m3, [srcq+strideq*0]
+ vbroadcasti32x4 ym6, [srcq+strideq*1]
+ vbroadcasti32x4 m0, [srcq+strideq*2]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
.v_w16_loop:
- vbroadcasti128 m3, [srcq+stride3q ]
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m3
+ pmaddubsw m13, m2, m9 ; a1 b1
+ vbroadcasti32x4 ym6, [srcq+stride3q ]
+ pmaddubsw m5, m2, m8 ; c0 d0
lea srcq, [srcq+strideq*4]
- vbroadcasti128 m5, [srcq+strideq*0]
- vpblendmq m1{k1}, m6, m3
- vmovdqa64 m3{k1}, m5
- pmaddubsw m12, m0, m8
- pmaddubsw m13, m2, m8
- pmaddubsw m14, m2, m9
- pmaddubsw m15, m4, m9
- pmaddubsw m0, m4, m10
- vbroadcasti128 m2, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- paddw m12, m14
- paddw m13, m15
- paddw m12, m0
- vmovdqa64 m5{k1}, m2
- vmovdqa64 m2{k1}, m6
- mova m0, m4
- shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
- shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
- punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
- punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
- pmaddubsw m14, m2, m10
- pmaddubsw m15, m2, m11
- paddw m13, m14
- paddw m12, m15
- pmaddubsw m14, m4, m11
- paddw m13, m14
- pmulhrsw m12, m7
- pmulhrsw m13, m7
- mova [tmpq+ 0], m12
- mova [tmpq+64], m13
+ pmaddubsw m14, m3, m9 ; c1 d1
+ vbroadcasti32x4 m3, [srcq+strideq*0]
+ vshufpd m0{k1}, m6, m3, 0xcc
+ vbroadcasti32x4 ym6, [srcq+strideq*1]
+ vpermb m2, m12, m0 ; 67 78
+ vbroadcasti32x4 m0, [srcq+strideq*2]
+ vshufpd m3{k1}, m6, m0, 0xcc
+ paddw m4, m13
+ pmaddubsw m13, m1, m10 ; a2 b2
+ vpermb m3, m12, m3 ; 89 9a
+ paddw m5, m14
+ pmaddubsw m14, m2, m10 ; c2 d2
+ pmaddubsw m15, m2, m11 ; a3 b3
+ pmaddubsw m6, m3, m11 ; c3 d3
+ paddw m4, m13
+ paddw m5, m14
+ paddw m4, m15
+ paddw m5, m6
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ mova [tmpq+ 0], m4
+ mova [tmpq+64], m5
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
- mova m18, [bilin_v_perm64]
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- movu ym3, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym4, [srcq+strideq*0]
- movu ym5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym6, [srcq+strideq*0]
- vpermq m0, m18, m0
- vpermq m1, m18, m1
- vpermq m2, m18, m2
- vpermq m3, m18, m3
- vpermq m4, m18, m4
- vpermq m5, m18, m5
- vpermq m6, m18, m6
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
- punpcklbw m3, m4
- punpcklbw m4, m5
- punpcklbw m5, m6
+ movshdup m21, [bilin_v_perm64]
+ movu ym16, [srcq+strideq*0]
+ movu ym17, [srcq+strideq*1]
+ movu ym18, [srcq+strideq*2]
+ add srcq, stride3q
+ movu ym19, [srcq+strideq*0]
+ vpermt2q m16, m21, m19 ; 0 3
+ movu ym20, [srcq+strideq*1]
+ vpermt2q m17, m21, m20 ; 1 4
+ movu ym20, [srcq+strideq*2]
+ add srcq, stride3q
+ vpermt2q m18, m21, m20 ; 2 5
+ movu ym20, [srcq+strideq*0]
+ vpermt2q m19, m21, m20 ; 3 6
+ punpcklbw m0, m16, m17 ; 01
+ punpcklbw m1, m17, m18 ; 12
+ punpcklbw m2, m18, m19 ; 23
+ punpckhbw m3, m16, m17 ; 34
+ punpckhbw m4, m17, m18 ; 45
+ punpckhbw m5, m18, m19 ; 56
.v_w32_loop:
- movu ym12, [srcq+strideq*1]
+ movu ym16, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movu ym13, [srcq+strideq*0]
+ movu ym17, [srcq+strideq*0]
pmaddubsw m14, m0, m8
- pmaddubsw m16, m2, m9
- pmaddubsw m15, m1, m8
- pmaddubsw m17, m3, m9
mova m0, m2
+ pmaddubsw m15, m1, m8
mova m1, m3
- vpermq m12, m18, m12
- vpermq m13, m18, m13
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m4, m10
- pmaddubsw m17, m5, m10
- punpcklbw m6, m12
- punpcklbw m12, m13
+ pmaddubsw m2, m9
+ vpermt2q m16, m21, m17 ; 7 8
+ pmaddubsw m3, m9
+ pmaddubsw m12, m4, m10
+ pmaddubsw m13, m5, m10
+ shufpd m19, m16, 0x55 ; 6 7
+ paddw m14, m2
mova m2, m4
+ punpcklbw m4, m19, m16 ; 67
+ paddw m15, m3
mova m3, m5
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m6, m11
- pmaddubsw m17, m12, m11
- mova m4, m6
- mova m5, m12
- paddw m14, m16
- paddw m15, m17
+ punpckhbw m5, m19, m16 ; 78
+ paddw m14, m12
+ paddw m15, m13
+ pmaddubsw m12, m4, m11
+ pmaddubsw m13, m5, m11
+ mova m19, m16
+ paddw m14, m12
+ paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
- mova m6, m13
mova [tmpq+ 0], m14
mova [tmpq+64], m15
add tmpq, 64*2
@@ -2750,154 +3650,241 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vzeroupper
RET
.v_w64:
- mov wd, 64
- jmp .v_start
.v_w128:
- mov wd, 128
-.v_start:
- WIN64_SPILL_XMM 27
- mova m26, [bilin_v_perm64]
- lea r6d, [hq+wq*2]
- mov r5, srcq
- mov r7, tmpq
+ WIN64_SPILL_XMM 24
+ mova m23, [bilin_v_perm64]
+ add wd, wd
+ lea r6d, [hq+wq]
.v_loop0:
- vpermq m0, m26, [srcq+strideq*0]
- vpermq m1, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m2, m26, [srcq+strideq*0]
- vpermq m3, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m4, m26, [srcq+strideq*0]
- vpermq m5, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m6, m26, [srcq+strideq*0]
- punpckhbw m12, m0, m1
- punpcklbw m0, m1
- punpckhbw m13, m1, m2
- punpcklbw m1, m2
- punpckhbw m14, m2, m3
- punpcklbw m2, m3
- punpckhbw m15, m3, m4
- punpcklbw m3, m4
- punpckhbw m16, m4, m5
- punpcklbw m4, m5
- punpckhbw m17, m5, m6
- punpcklbw m5, m6
+ vpermq m12, m23, [srcq+strideq*0]
+ vpermq m13, m23, [srcq+strideq*1]
+ lea r5, [srcq+strideq*2]
+ vpermq m14, m23, [r5 +strideq*0]
+ vpermq m15, m23, [r5 +strideq*1]
+ lea r5, [r5+strideq*2]
+ vpermq m16, m23, [r5 +strideq*0]
+ vpermq m17, m23, [r5 +strideq*1]
+ lea r5, [r5+strideq*2]
+ vpermq m18, m23, [r5 +strideq*0]
+ mov r7, tmpq
+ punpcklbw m0, m12, m13 ; 01
+ punpckhbw m12, m13
+ punpcklbw m1, m13, m14 ; 12
+ punpckhbw m13, m14
+ punpcklbw m2, m14, m15 ; 23
+ punpckhbw m14, m15
+ punpcklbw m3, m15, m16 ; 34
+ punpckhbw m15, m16
+ punpcklbw m4, m16, m17 ; 45
+ punpckhbw m16, m17
+ punpcklbw m5, m17, m18 ; 56
+ punpckhbw m17, m18
.v_loop:
- vpermq m18, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m19, m26, [srcq+strideq*0]
- pmaddubsw m20, m0, m8
- pmaddubsw m21, m12, m8
- pmaddubsw m22, m1, m8
- pmaddubsw m23, m13, m8
+ pmaddubsw m19, m0, m8 ; a0
+ vpermq m6, m23, [r5+strideq*1]
+ pmaddubsw m20, m12, m8
mova m0, m2
+ pmaddubsw m2, m9 ; a1
mova m12, m14
+ pmaddubsw m14, m9
+ lea r5, [r5+strideq*2]
+ pmaddubsw m21, m1, m8 ; b0
+ pmaddubsw m22, m13, m8
mova m1, m3
+ pmaddubsw m3, m9 ; b1
mova m13, m15
- pmaddubsw m2, m9
- pmaddubsw m14, m9
- pmaddubsw m3, m9
pmaddubsw m15, m9
- punpckhbw m24, m6, m18
- punpcklbw m6, m18
- paddw m20, m2
- paddw m21, m14
- paddw m22, m3
- paddw m23, m15
+ paddw m19, m2
mova m2, m4
+ pmaddubsw m4, m10 ; a2
+ paddw m20, m14
mova m14, m16
+ pmaddubsw m16, m10
+ paddw m21, m3
mova m3, m5
+ pmaddubsw m5, m10 ; b2
+ paddw m22, m15
mova m15, m17
- pmaddubsw m4, m10
- pmaddubsw m16, m10
- pmaddubsw m5, m10
pmaddubsw m17, m10
- punpckhbw m25, m18, m19
- punpcklbw m18, m19
- paddw m20, m4
- paddw m21, m16
- paddw m22, m5
- paddw m23, m17
- mova m4, m6
- mova m16, m24
- mova m5, m18
- mova m17, m25
- pmaddubsw m6, m11
- pmaddubsw m24, m11
- pmaddubsw m18, m11
- pmaddubsw m25, m11
- paddw m20, m6
- paddw m21, m24
- paddw m22, m18
- paddw m23, m25
- pmulhrsw m20, m7
- pmulhrsw m21, m7
- pmulhrsw m22, m7
- pmulhrsw m23, m7
- mova m6, m19
- mova [tmpq+wq*0+ 0], m20
- mova [tmpq+wq*0+64], m21
- mova [tmpq+wq*2+ 0], m22
- mova [tmpq+wq*2+64], m23
- lea tmpq, [tmpq+wq*4]
+ paddw m19, m4
+ punpcklbw m4, m18, m6 ; 67
+ paddw m20, m16
+ punpckhbw m16, m18, m6
+ vpermq m18, m23, [r5+strideq*0]
+ paddw m21, m5
+ pmaddubsw m5, m4, m11 ; a3
+ paddw m22, m17
+ pmaddubsw m17, m16, m11
+ paddw m19, m5
+ punpcklbw m5, m6, m18 ; 78
+ paddw m20, m17
+ punpckhbw m17, m6, m18
+ pmaddubsw m6, m5, m11 ; b3
+ paddw m21, m6
+ pmaddubsw m6, m17, m11
+ paddw m22, m6
+ REPX {pmulhrsw x, m7}, m19, m20, m21, m22
+ mova [r7+wq*0+ 0], m19
+ mova [r7+wq*0+64], m20
+ mova [r7+wq*1+ 0], m21
+ mova [r7+wq*1+64], m22
+ lea r7, [r7+wq*2]
sub hd, 2
jg .v_loop
- add r5, 64
- add r7, 128
+ add srcq, 64
+ add tmpq, 128
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .v_loop0
RET
-.hv:
- WIN64_SPILL_XMM 16
+.h:
+ RESET_STACK_STATE
+ test myd, 0xf00
+ jnz .hv
+.h2:
+ vpbroadcastd m4, [pd_2]
cmp wd, 4
- je .hv_w4
+ je .h_w4
+ tzcnt wd, wd
shr mxd, 16
sub srcq, 3
- vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
- vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 4
- cmove myd, mxd
- tzcnt wd, wd
- vpbroadcastd m8, [pd_2]
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
- vpbroadcastd m9, [pd_32]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m9, [base+subpel_filters+mxq*8+4]
add wq, r7
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m12, m0, q0000
- pshufd m13, m0, q1111
- pshufd m14, m0, q2222
- pshufd m15, m0, q3333
jmp wq
-.hv_w4:
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [base+subpel_filters+mxq*8+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16]
+ vpbroadcastd m7, [pb_4]
+ lea stride3q, [strideq*3]
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32]
+ vpbroadcastd m7, [pb_4]
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32]
+ vpbroadcastd m7, [pb_4]
+ sub srcq, r6
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_loop0:
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r5+32*0]
+ movu m1, [srcq+r5+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r5, 64
+ jle .h_loop
+ add srcq, strideq
+ dec hd
+ jg .h_loop0
+ RET
+.hv:
+ RESET_STACK_STATE
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastd m9, [pd_32]
+ cmp wd, 4
+ jg .hv_w8
movzx mxd, mxb
dec srcq
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+2]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
lea stride3q, [strideq*3]
sub srcq, stride3q
mov r3d, 0x04
kmovb k1, r3d
kshiftlb k2, k1, 2
kshiftlb k3, k1, 4
- vpbroadcastd m10, [pd_2]
- vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufA]
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
- vpbroadcastd m11, [pd_32]
pshufd m12, m0, q0000
pshufd m13, m0, q1111
pshufd m14, m0, q2222
@@ -2910,263 +3897,265 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vpbroadcastq m3{k2}, [srcq+strideq*0]
vpbroadcastq m2{k3}, [srcq+strideq*1]
vpbroadcastq m3{k3}, [srcq+strideq*2]
- mova m17, [spel_hv_perm4a]
- movu m18, [spel_hv_perm4b]
- mova m0, m10
- mova m1, m10
- pshufb m2, m16
- pshufb m3, m16
- vpdpbusd m0, m2, m8
- vpdpbusd m1, m3, m8
+ mova m6, [spel_hv_perm4a]
+ movu m7, [spel_hv_perm4b]
+ mova m0, m8
+ mova m1, m8
+ pshufb m2, m10
+ pshufb m3, m10
+ vpdpbusd m0, m2, m11
+ vpdpbusd m1, m3, m11
packssdw m0, m1 ; _ 0 1 2 3 4 5 6
psraw m0, 2
- vpermb m1, m17, m0 ; 01 12 23 34
- vpermb m2, m18, m0 ; 23 34 45 56
+ vpermb m1, m6, m0 ; 01 12 23 34
+ vpermb m2, m7, m0 ; 23 34 45 56
.hv_w4_loop:
movq xm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
movq xm4, [srcq+strideq*0]
vpbroadcastq ym3{k1}, [srcq+strideq*1]
vpbroadcastq ym4{k1}, [srcq+strideq*2]
- mova ym5, ym10
- mova ym6, ym10
- pshufb ym3, ym16
- pshufb ym4, ym16
- vpdpbusd ym5, ym3, ym8
- vpdpbusd ym6, ym4, ym8
- mova m7, m11
- packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
- psraw ym5, 2
- valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
- vpdpwssd m7, m1, m12
- vpdpwssd m7, m2, m13
- vpermb m1, m17, m0 ; 45 56 67 78
- vpermb m2, m18, m0 ; 67 78 89 9a
- vpdpwssd m7, m1, m14
- vpdpwssd m7, m2, m15
- psrad m7, 6
- vpmovdw [tmpq], m7
+ mova m5, m9
+ pshufb ym3, ym10
+ vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
+ mova ym1, ym8
+ pshufb ym4, ym10
+ vpdpbusd ym1, ym3, ym11
+ mova ym3, ym8
+ vpdpbusd ym3, ym4, ym11
+ vpdpwssd m5, m2, m13 ; a1 b1 c1 d1
+ packssdw ym1, ym3 ; 7 8 9 a
+ psraw ym1, 2
+ vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a
+ vpermb m1, m6, m0 ; 45 56 67 78
+ vpermb m2, m7, m0 ; 67 78 89 9a
+ vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m5, m2, m15 ; a3 b3 c3 d3
+ psrad m5, 6
+ vpmovdw [tmpq], m5
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
- vzeroupper
RET
.hv_w8:
- WIN64_SPILL_XMM 24
- vbroadcasti128 m16, [subpel_h_shufA]
- vbroadcasti128 m17, [subpel_h_shufB]
- vbroadcasti128 m18, [subpel_h_shufC]
- vinserti128 ym0, [srcq+strideq*0], 1
- vinserti128 m0, [srcq+strideq*1], 2
- vinserti128 m0, [srcq+strideq*2], 3
- movu xm1, [srcq+stride3q ]
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jg .hv_w16
+ vbroadcasti32x4 m17, [srcq+stride3q ]
+ vinserti32x4 m16, m17, [srcq+strideq*0], 0
+ vbroadcasti32x4 m19, [subpel_h_shufA]
+ vinserti32x4 m16, [srcq+strideq*1], 1
+ vbroadcasti32x4 m21, [subpel_h_shufC]
+ vinserti32x4 m16, [srcq+strideq*2], 2
lea srcq, [srcq+strideq*4]
- vinserti128 ym1, [srcq+strideq*0], 1
- vinserti128 m1, [srcq+strideq*1], 2
- vinserti128 m1, [srcq+strideq*2], 3
+ vinserti128 ym17, [srcq+strideq*0], 1
+ vbroadcasti32x4 m20, [subpel_h_shufB]
+ vinserti32x4 m17, [srcq+strideq*1], 2
+ vinserti32x4 m17, [srcq+strideq*2], 3
+ pshufb m3, m16, m19 ; 0 1 2 3 0123
mova m2, m8
- mova m4, m8
+ pshufb m0, m16, m21 ; 0 1 2 3 89ab
+ vpdpbusd m2, m3, m10
mova m3, m8
- mova m5, m8
- pshufb m20, m0, m16
- pshufb m21, m0, m17
- pshufb m22, m0, m18
- pshufb m23, m1, m16
- pshufb m6, m1, m17
- pshufb m7, m1, m18
- vpdpbusd m2, m20, m10
- vpdpbusd m4, m21, m10
- vpdpbusd m2, m21, m11
- vpdpbusd m4, m22, m11
- vpdpbusd m3, m23, m10
- vpdpbusd m5, m6, m10
- vpdpbusd m3, m6, m11
- vpdpbusd m5, m7, m11
- packssdw m2, m4
- packssdw m3, m5
- psraw m2, 2 ; _ 0 1 2
- psraw m3, 2 ; 3 4 5 6
- valignq m0, m3, m2, 2 ; 0 1 2 3
- valignq m1, m3, m2, 4 ; 1 2 3 4
- valignq m2, m3, m2, 6 ; 2 3 4 5
- punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
- punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
- punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
- punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+ pshufb m1, m17, m19 ; 3 4 5 6 0123
+ vpdpbusd m3, m0, m11
+ mova m0, m8
+ pshufb m4, m17, m21 ; 3 4 5 6 89ab
+ vpdpbusd m0, m1, m10
+ mova m1, m8
+ pshufb m16, m20 ; 0 1 2 3 4567
+ vpdpbusd m1, m4, m11
+ pshufb m17, m20 ; 3 4 5 6 4567
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m16, m10
+ vpdpbusd m0, m17, m11
+ vpdpbusd m1, m17, m10
+ packssdw m2, m3
+ packssdw m0, m1
+ psraw m2, 2 ; 0 1 2 3
+ psraw m0, 2 ; 3 4 5 6
+ vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5
+ vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4
+ punpcklwd m3, m4, m0 ; 23 34 45 56
+ punpckhwd m4, m0
+ punpcklwd m1, m2, m5 ; 01 12 23 34
+ punpckhwd m2, m5
.hv_w8_loop:
- movu xm19, [srcq+stride3q ]
+ movu xm18, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vinserti128 ym19, [srcq+strideq*0], 1
- vinserti128 m19, [srcq+strideq*1], 2
- vinserti128 m19, [srcq+strideq*2], 3
- mova m20, m9
- mova m21, m9
- mova m22, m8
- mova m23, m8
- vpdpwssd m20, m4, m12
- vpdpwssd m21, m5, m12
- vpdpwssd m20, m6, m13
- vpdpwssd m21, m7, m13
- pshufb m0, m19, m16
- pshufb m1, m19, m17
- pshufb m2, m19, m18
- vpdpbusd m22, m0, m10
- vpdpbusd m23, m1, m10
- vpdpbusd m22, m1, m11
- vpdpbusd m23, m2, m11
- packssdw m22, m23
- psraw m22, 2 ; 7 8 9 A
- valignq m0, m22, m3, 2 ; 4 5 6 7
- valignq m1, m22, m3, 4 ; 5 6 7 8
- valignq m2, m22, m3, 6 ; 6 7 8 9
- mova m3, m22
- punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
- punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
- punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
- punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
- psrad m20, 6
- psrad m21, 6
- packssdw m20, m21
- mova [tmpq], m20
+ vinserti128 ym18, [srcq+strideq*0], 1
+ vinserti32x4 m18, [srcq+strideq*1], 2
+ vinserti32x4 m18, [srcq+strideq*2], 3
+ pshufb m17, m18, m19 ; 7 8 9 a 0123
+ mova m16, m8
+ pshufb m5, m18, m21 ; 7 8 9 a 89ab
+ vpdpbusd m16, m17, m10
+ mova m17, m8
+ pshufb m18, m20 ; 7 8 9 a 4567
+ vpdpbusd m17, m5, m11
+ mova m5, m9
+ vpdpwssd m5, m3, m13 ; a1 b1 c1 d1
+ mova m6, m9
+ vpdpwssd m6, m4, m13
+ vpdpbusd m16, m18, m11
+ vpdpbusd m17, m18, m10
+ vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
+ mova m1, m3
+ vpdpwssd m6, m2, m12
+ mova m2, m4
+ packssdw m16, m17
+ psraw m16, 2 ; 7 8 9 a
+ valignq m4, m16, m0, 6 ; 6 7 8 9
+ mova m0, m16
+ punpcklwd m3, m4, m16 ; 67 78 89 9a
+ punpckhwd m4, m16
+ vpdpwssd m5, m3, m15 ; a3 b3 c3 d3
+ vpdpwssd m6, m4, m15
+ vshufi32x4 m1, m3, q1032 ; 45 56 67 78
+ vshufi32x4 m2, m4, q1032
+ vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m6, m2, m14
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ mova [tmpq], m5
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
+ vzeroupper
RET
.hv_w16:
- mov wd, 16*2
- jmp .hv_start
-.hv_w32:
- mov wd, 32*2
- jmp .hv_start
-.hv_w64:
- mov wd, 64*2
- jmp .hv_start
-.hv_w128:
- mov wd, 128*2
-.hv_start:
- WIN64_SPILL_XMM 31
- mova m16, [spel_h_perm16a]
- mova m17, [spel_h_perm16b]
- mova m18, [spel_h_perm16c]
+ WIN64_SPILL_XMM 23
+ mova m16, [spel_h_perm16]
+ vpbroadcastd m18, [pb_4]
+ add wd, wd
+ paddb m17, m18, m16
lea r6d, [hq+wq*8-256]
- mov r5, srcq
+ paddb m18, m17
+.hv_w16_loop0:
+ movu ym19, [srcq+strideq*0]
+ vinserti32x8 m19, [srcq+strideq*1], 1
+ lea r5, [srcq+strideq*2]
+ movu ym20, [r5 +strideq*0]
+ vinserti32x8 m20, [r5 +strideq*1], 1
+ lea r5, [r5 +strideq*2]
+ movu ym21, [r5 +strideq*0]
+ vinserti32x8 m21, [r5 +strideq*1], 1
+ lea r5, [r5 +strideq*2]
+ movu ym22, [r5 +strideq*0]
mov r7, tmpq
-.hv_loop0:
- movu ym0, [srcq+strideq*0]
- vinserti32x8 m0, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym1, [srcq+strideq*0]
- vinserti32x8 m1, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- vinserti32x8 m2, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym3, [srcq+strideq*0]
+ vpermb m3, m16, m19 ; 0 1 0123 89ab
+ mova m2, m8
+ vpermb m4, m18, m19 ; 0 1 89ab ghij
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ vpermb m5, m16, m20 ; 2 3 0123 89ab
+ vpdpbusd m3, m4, m11
mova m4, m8
+ vpermb m6, m18, m20 ; 2 3 89ab ghij
+ vpdpbusd m4, m5, m10
mova m5, m8
+ vpermb m7, m16, m21 ; 4 5 0123 89ab
+ vpdpbusd m5, m6, m11
mova m6, m8
+ vpermb m0, m18, m21 ; 4 5 89ab ghij
+ vpdpbusd m6, m7, m10
mova m7, m8
- vpermb m19, m16, m0
- vpermb m20, m17, m0
- vpermb m21, m18, m0
- vpermb m22, m16, m1
- vpermb m23, m17, m1
- vpermb m24, m18, m1
- vpermb m25, m16, m2
- vpermb m26, m17, m2
- vpermb m27, m18, m2
- vpermb ym28, ym16, ym3
- vpermb ym29, ym17, ym3
- vpermb ym30, ym18, ym3
- mova m0, m8
- mova m1, m8
- mova ym2, ym8
- mova ym3, ym8
- vpdpbusd m4, m19, m10
- vpdpbusd m5, m20, m10
- vpdpbusd m6, m22, m10
- vpdpbusd m7, m23, m10
- vpdpbusd m0, m25, m10
- vpdpbusd m1, m26, m10
- vpdpbusd ym2, ym28, ym10
- vpdpbusd ym3, ym29, ym10
+ vpermb ym1, ym16, ym22 ; 6 0123 89ab
+ vpdpbusd m7, m0, m11
+ mova ym0, ym8
+ vpermb m19, m17, m19 ; 0 1 4567 cdef
+ vpdpbusd ym0, ym1, ym10
+ vpermb ym1, ym18, ym22 ; 6 89ab ghij
+ vpdpbusd m2, m19, m11
+ vpdpbusd m3, m19, m10
+ mova ym19, ym8
+ vpermb m20, m17, m20 ; 2 3 4567 cdef
+ vpdpbusd ym19, ym1, ym11
+ vpermb m21, m17, m21 ; 4 5 4567 cdef
vpdpbusd m4, m20, m11
- vpdpbusd m5, m21, m11
- vpdpbusd m6, m23, m11
- vpdpbusd m7, m24, m11
- vpdpbusd m0, m26, m11
- vpdpbusd m1, m27, m11
- vpdpbusd ym2, ym29, ym11
- vpdpbusd ym3, ym30, ym11
- packssdw m4, m5
- packssdw m6, m7
- packssdw m0, m1
- packssdw ym2, ym3
- psraw m4, 2 ; 0a 0b 1a 1b
- psraw m6, 2 ; 2a 2b 3a 3b
- psraw m0, 2 ; 4a 4b 5a 5b
- psraw ym2, 2 ; 6a 6b __ __
- vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
- vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
- vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
- punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
- punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
- punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
- punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
- punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
- punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
-.hv_loop:
- movu ym19, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti32x8 m19, [srcq+strideq*0], 1
+ vpdpbusd m5, m20, m10
+ vpermb ym22, ym17, ym22 ; 6 4567 cdef
+ vpdpbusd m6, m21, m11
+ vpdpbusd m7, m21, m10
+ packssdw m2, m3 ; 0 1
+ vpdpbusd ym0, ym22, ym11
+ packssdw m4, m5 ; 2 3
+ vpdpbusd ym19, ym22, ym10
+ packssdw m6, m7 ; 4 5
+ packssdw ym0, ym19 ; 6
+ REPX {psraw x, 2}, m2, m4, m6, ym0
+ vshufi32x4 m3, m2, m4, q1032 ; 1 2
+ vshufi32x4 m5, m4, m6, q1032 ; 3 4
+ vshufi32x4 m0, m6, m0, q1032 ; 5 6
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5 ; 23 34
+ punpckhwd m4, m5
+ punpcklwd m5, m6, m0 ; 45 56
+ punpckhwd m6, m0
+.hv_w16_loop:
+ movu ym19, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vinserti32x8 m19, [r5+strideq*0], 1
mova m20, m9
+ vpdpwssd m20, m1, m12 ; a0
+ vpermb m1, m16, m19
mova m21, m9
+ vpdpwssd m21, m2, m12 ; b0
+ vpermb m2, m17, m19
mova m22, m8
- mova m23, m8
- vpdpwssd m20, m2, m12
- vpdpwssd m21, m3, m12
- vpdpwssd m20, m4, m13
- vpdpwssd m21, m5, m13
- vpermb m24, m16, m19
- vpermb m25, m17, m19
- vpermb m26, m18, m19
- vpdpbusd m22, m24, m10
- vpdpbusd m23, m25, m10
- vpdpbusd m22, m25, m11
- vpdpbusd m23, m26, m11
- packssdw m22, m23
- psraw m22, 2 ; 7a 7b 8a 8b
- vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ vpdpbusd m22, m1, m10
+ mova m1, m8
+ vpermb m19, m18, m19
+ vpdpbusd m1, m2, m10
+ vpdpwssd m20, m3, m13 ; a1
+ vpdpwssd m21, m4, m13 ; b1
+ vpdpbusd m22, m2, m11
mova m2, m4
- mova m3, m5
- mova m1, m22
+ vpdpbusd m1, m19, m11
mova m4, m6
- mova m5, m7
- punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
- punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
+ vpdpwssd m20, m5, m14 ; a2
+ vpdpwssd m21, m6, m14 ; b2
+ packssdw m22, m1
+ mova m1, m3
+ psraw m22, 2 ; 7 8
+ mova m3, m5
+ vshufi32x4 m6, m0, m22, q1032 ; 6 7
+ mova m0, m22
+ punpcklwd m5, m6, m0 ; 67 78
+ punpckhwd m6, m0
+ vpdpwssd m20, m5, m15 ; a3
+ vpdpwssd m21, m6, m15 ; b3
psrad m20, 6
psrad m21, 6
packssdw m20, m21
- mova [tmpq+wq*0], ym20
- vextracti32x8 [tmpq+wq*1], m20, 1
- lea tmpq, [tmpq+wq*2]
+ mova [r7+wq*0], ym20
+ vextracti32x8 [r7+wq*1], m20, 1
+ lea r7, [r7+wq*2]
sub hd, 2
- jg .hv_loop
- add r5, 16
- add r7, 32
+ jg .hv_w16_loop
+ add srcq, 16
+ add tmpq, 32
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
- jg .hv_loop0
+ jg .hv_w16_loop0
RET
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
diff --git a/third_party/dav1d/tests/meson.build b/third_party/dav1d/tests/meson.build
index 11db0a56e9..38a591b5b4 100644
--- a/third_party/dav1d/tests/meson.build
+++ b/third_party/dav1d/tests/meson.build
@@ -100,7 +100,7 @@ if is_asm_enabled
],
)
- test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false)
+ test('checkasm', checkasm, suite: 'checkasm', timeout: 180)
benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
endif